In [1]:
import numpy as np 
import pandas as pd 
import warnings
warnings.filterwarnings('ignore')

import plotly.graph_objects as go

In [2]:
# Location of the raw transactions file; read with lines=True, i.e. one JSON record per line.
DATA_PATH = '../input/fraud-detection-dataset/transactions.txt'
df = pd.read_json(DATA_PATH, lines=True)

In [3]:
# Preview the first rows to confirm the records were parsed into columns.
df.head()

Unnamed: 0,accountNumber,customerId,creditLimit,availableMoney,transactionDateTime,transactionAmount,merchantName,acqCountry,merchantCountryCode,posEntryMode,...,echoBuffer,currentBalance,merchantCity,merchantState,merchantZip,cardPresent,posOnPremises,recurringAuthInd,expirationDateKeyInMatch,isFraud
0,737265056,737265056,5000,5000.0,2016-08-13T14:27:32,98.55,Uber,US,US,2,...,,0.0,,,,False,,,False,False
1,737265056,737265056,5000,5000.0,2016-10-11T05:05:54,74.51,AMC #191138,US,US,9,...,,0.0,,,,True,,,False,False
2,737265056,737265056,5000,5000.0,2016-11-08T09:18:39,7.47,Play Store,US,US,9,...,,0.0,,,,False,,,False,False
3,737265056,737265056,5000,5000.0,2016-12-10T02:14:50,7.47,Play Store,US,US,9,...,,0.0,,,,False,,,False,False
4,830329091,830329091,5000,5000.0,2016-03-24T21:04:46,71.18,Tim Hortons #947751,US,US,2,...,,0.0,,,,True,,,False,False


In [4]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(786363, 29)

In [5]:
# Full list of column names (the preview above elides the middle columns).
df.columns

Index(['accountNumber', 'customerId', 'creditLimit', 'availableMoney',
       'transactionDateTime', 'transactionAmount', 'merchantName',
       'acqCountry', 'merchantCountryCode', 'posEntryMode', 'posConditionCode',
       'merchantCategoryCode', 'currentExpDate', 'accountOpenDate',
       'dateOfLastAddressChange', 'cardCVV', 'enteredCVV', 'cardLast4Digits',
       'transactionType', 'echoBuffer', 'currentBalance', 'merchantCity',
       'merchantState', 'merchantZip', 'cardPresent', 'posOnPremises',
       'recurringAuthInd', 'expirationDateKeyInMatch', 'isFraud'],
      dtype='object')

In [6]:
# Inferred dtype of every column; date-like columns are still plain objects at this point.
df.dtypes

accountNumber                 int64
customerId                    int64
creditLimit                   int64
availableMoney              float64
transactionDateTime          object
transactionAmount           float64
merchantName                 object
acqCountry                   object
merchantCountryCode          object
posEntryMode                 object
posConditionCode             object
merchantCategoryCode         object
currentExpDate               object
accountOpenDate              object
dateOfLastAddressChange      object
cardCVV                       int64
enteredCVV                    int64
cardLast4Digits               int64
transactionType              object
echoBuffer                   object
currentBalance              float64
merchantCity                 object
merchantState                object
merchantZip                  object
cardPresent                    bool
posOnPremises                object
recurringAuthInd             object
expirationDateKeyInMatch    

In [7]:
# Distinct-value count per column; a count of 1 marks a column that never varies.
df.nunique()

accountNumber                 5000
customerId                    5000
creditLimit                     10
availableMoney              521915
transactionDateTime         776637
transactionAmount            66038
merchantName                  2490
acqCountry                       5
merchantCountryCode              5
posEntryMode                     6
posConditionCode                 4
merchantCategoryCode            19
currentExpDate                 165
accountOpenDate               1820
dateOfLastAddressChange       2184
cardCVV                        899
enteredCVV                     976
cardLast4Digits               5245
transactionType                  4
echoBuffer                       1
currentBalance              487318
merchantCity                     1
merchantState                    1
merchantZip                      1
cardPresent                      2
posOnPremises                    1
recurringAuthInd                 1
expirationDateKeyInMatch         2
isFraud             

In [8]:
# Columns that hold no information (empty in every previewed row, one distinct value each).
empty_cols = [
    'merchantCity', 'merchantState', 'merchantZip',
    'echoBuffer', 'posOnPremises', 'recurringAuthInd',
]
df.drop(columns=empty_cols, inplace=True)

In [9]:
# Re-check the frame after removing the empty columns.
df.head()

Unnamed: 0,accountNumber,customerId,creditLimit,availableMoney,transactionDateTime,transactionAmount,merchantName,acqCountry,merchantCountryCode,posEntryMode,...,accountOpenDate,dateOfLastAddressChange,cardCVV,enteredCVV,cardLast4Digits,transactionType,currentBalance,cardPresent,expirationDateKeyInMatch,isFraud
0,737265056,737265056,5000,5000.0,2016-08-13T14:27:32,98.55,Uber,US,US,2,...,2015-03-14,2015-03-14,414,414,1803,PURCHASE,0.0,False,False,False
1,737265056,737265056,5000,5000.0,2016-10-11T05:05:54,74.51,AMC #191138,US,US,9,...,2015-03-14,2015-03-14,486,486,767,PURCHASE,0.0,True,False,False
2,737265056,737265056,5000,5000.0,2016-11-08T09:18:39,7.47,Play Store,US,US,9,...,2015-03-14,2015-03-14,486,486,767,PURCHASE,0.0,False,False,False
3,737265056,737265056,5000,5000.0,2016-12-10T02:14:50,7.47,Play Store,US,US,9,...,2015-03-14,2015-03-14,486,486,767,PURCHASE,0.0,False,False,False
4,830329091,830329091,5000,5000.0,2016-03-24T21:04:46,71.18,Tim Hortons #947751,US,US,2,...,2015-08-06,2015-08-06,885,885,3143,PURCHASE,0.0,True,False,False


#### **Target column**

In [10]:
# Aggregate the target first so Plotly receives one slice per class instead of
# one label per transaction row (hundreds of thousands of points otherwise).
# value_counts() orders classes by frequency, so colors[0] goes to the majority
# class and colors[1] to the minority class.
fraud_counts = df['isFraud'].value_counts()

fig = go.Figure(data=[go.Pie(labels=fraud_counts.index.tolist(),
                             values=fraud_counts.tolist(),
                             hole=.3)])
# Name of the plotted column, shown in the middle of the donut hole.
fig.add_annotation(text='isFraud',
                   x=0.5, y=0.5, showarrow=False, font_size=14, opacity=0.7, font_family='monospace')
fig.update_traces(hoverinfo='label+percent+value',
                  marker=dict(colors=['darkorange', 'blue'], line=dict(color='#000000', width=2)))
fig.show()

#### **Data preprocessing**

In [11]:
from sklearn.preprocessing import LabelEncoder

# Columns to replace with integer codes: string categoricals, boolean flags and the target.
var = ['merchantName','acqCountry','merchantCountryCode','posEntryMode','posConditionCode','merchantCategoryCode','transactionType','cardPresent','expirationDateKeyInMatch','isFraud']

# Fit one encoder per column and keep it. Re-fitting a single shared encoder
# would discard every mapping but the last, making the codes impossible to
# translate back; with this dict use encoders[col].inverse_transform(codes).
encoders = {}
for i in var:
    le = LabelEncoder()
    df[i] = le.fit_transform(df[i])
    encoders[i] = le

In [12]:
# Parse the object-typed date columns into datetimes so the .dt accessor can be used on them.
date_cols = ['transactionDateTime', 'currentExpDate', 'accountOpenDate', 'dateOfLastAddressChange']
for col in date_cols:
    df[col] = pd.to_datetime(df[col])

In [13]:
# Expand every datetime column into integer component columns named "<column>_<part>".
# Only the transaction timestamp gets time-of-day parts; the other three get date parts only.
date_parts = ['year', 'month', 'day']
time_parts = ['hour', 'minute', 'second']
parts_by_column = {
    'transactionDateTime': date_parts + time_parts,
    'currentExpDate': date_parts,
    'accountOpenDate': date_parts,
    'dateOfLastAddressChange': date_parts,
}

for col, parts in parts_by_column.items():
    for part in parts:
        df[f'{col}_{part}'] = getattr(df[col].dt, part)

In [14]:
# The raw datetime columns are no longer needed once their components exist; remove all four at once.
df.drop(
    columns=['transactionDateTime', 'currentExpDate', 'accountOpenDate', 'dateOfLastAddressChange'],
    inplace=True,
)

In [15]:
# Inspect the frame after encoding and datetime expansion.
df.head()

Unnamed: 0,accountNumber,customerId,creditLimit,availableMoney,transactionAmount,merchantName,acqCountry,merchantCountryCode,posEntryMode,posConditionCode,...,transactionDateTime_second,currentExpDate_year,currentExpDate_month,currentExpDate_day,accountOpenDate_year,accountOpenDate_month,accountOpenDate_day,dateOfLastAddressChange_year,dateOfLastAddressChange_month,dateOfLastAddressChange_day
0,737265056,737265056,5000,5000.0,98.55,2086,4,4,1,1,...,32,2023,6,1,2015,3,14,2015,3,14
1,737265056,737265056,5000,5000.0,74.51,27,4,4,3,1,...,54,2024,2,1,2015,3,14,2015,3,14
2,737265056,737265056,5000,5000.0,7.47,1305,4,4,3,1,...,39,2025,8,1,2015,3,14,2015,3,14
3,737265056,737265056,5000,5000.0,7.47,1305,4,4,3,1,...,50,2025,8,1,2015,3,14,2015,3,14
4,830329091,830329091,5000,5000.0,71.18,2084,4,4,1,1,...,46,2029,10,1,2015,8,6,2015,8,6


In [16]:
# Verify that every remaining column is numeric before computing VIF and modelling.
df.dtypes

accountNumber                      int64
customerId                         int64
creditLimit                        int64
availableMoney                   float64
transactionAmount                float64
merchantName                       int64
acqCountry                         int64
merchantCountryCode                int64
posEntryMode                       int64
posConditionCode                   int64
merchantCategoryCode               int64
cardCVV                            int64
enteredCVV                         int64
cardLast4Digits                    int64
transactionType                    int64
currentBalance                   float64
cardPresent                        int64
expirationDateKeyInMatch           int64
isFraud                            int64
transactionDateTime_year           int64
transactionDateTime_month          int64
transactionDateTime_day            int64
transactionDateTime_hour           int64
transactionDateTime_minute         int64
transactionDateT

#### **VIF (Variance Inflation Factor)**
Used to detect multicollinearity among the features.

In [17]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

def calc_vif(X):
    """Compute the variance inflation factor (VIF) for every column of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; columns are expected to be numeric. The matrix is
        passed to statsmodels as-is (no intercept column is added here).

    Returns
    -------
    pandas.DataFrame
        One row per column of ``X``, in column order, with two columns:
        ``"variables"`` (the column name) and ``"VIF"`` (its VIF value).
    """
    # X.values is loop-invariant; build the array once instead of once per
    # column, which matters for a large frame with mixed dtypes.
    exog = X.values

    vif = pd.DataFrame()
    vif["variables"] = X.columns
    vif["VIF"] = [variance_inflation_factor(exog, i) for i in range(exog.shape[1])]

    return vif

In [18]:
# Feature matrix: every column except the target.
X = df.drop(columns='isFraud')
calc_vif(X)

Unnamed: 0,variables,VIF
0,accountNumber,inf
1,customerId,inf
2,creditLimit,inf
3,availableMoney,inf
4,transactionAmount,1.014572
5,merchantName,1.432368
6,acqCountry,1.758867
7,merchantCountryCode,1.758894
8,posEntryMode,1.000053
9,posConditionCode,1.000034


We can see here that 'enteredCVV', 'customerId', 'availableMoney', 'currentBalance', 'cardCVV', 'accountNumber' and 'creditLimit' have high VIF values, meaning each of them can be predicted from the other independent variables in the dataset (for example, 'customerId' duplicates 'accountNumber' in every row previewed above).

#### **Fixing Multicollinearity**
Dropping one feature from each group of correlated features brings down the multicollinearity among the features that remain:



In [19]:
# Remove one member of each redundant group, then recompute VIF on the remaining features.
redundant_cols = ['enteredCVV', 'customerId', 'availableMoney']
df.drop(columns=redundant_cols, inplace=True)

X = df.drop(columns='isFraud')
calc_vif(X)

Unnamed: 0,variables,VIF
0,accountNumber,1.040712
1,creditLimit,1.777734
2,transactionAmount,1.014572
3,merchantName,1.432367
4,acqCountry,1.758867
5,merchantCountryCode,1.758894
6,posEntryMode,1.000051
7,posConditionCode,1.000034
8,merchantCategoryCode,2.30921
9,cardCVV,1.020937


#### **Model building**

I'm using PyCaret. The PyCaret package automates the major steps of evaluating and comparing machine learning algorithms for classification and regression. Its main benefit is that a lot can be achieved with very few lines of code and little manual configuration.

In [20]:
! pip install pycaret

Collecting pycaret
  Downloading pycaret-2.3.3-py3-none-any.whl (264 kB)
[K     |████████████████████████████████| 264 kB 594 kB/s 
Collecting gensim<4.0.0
  Downloading gensim-3.8.3-cp37-cp37m-manylinux1_x86_64.whl (24.2 MB)
[K     |████████████████████████████████| 24.2 MB 566 kB/s 
Collecting imbalanced-learn==0.7.0
  Downloading imbalanced_learn-0.7.0-py3-none-any.whl (167 kB)
[K     |████████████████████████████████| 167 kB 19.8 MB/s 
Collecting pyod
  Downloading pyod-0.9.0.tar.gz (105 kB)
[K     |████████████████████████████████| 105 kB 30.2 MB/s 
Collecting mlflow
  Downloading mlflow-1.19.0-py3-none-any.whl (14.4 MB)
[K     |████████████████████████████████| 14.4 MB 25.6 MB/s 
Collecting scipy<=1.5.4
  Downloading scipy-1.5.4-cp37-cp37m-manylinux1_x86_64.whl (25.9 MB)
[K     |████████████████████████████████| 25.9 MB 24.6 MB/s 
Collecting querystring-parser
  Downloading querystring_parser-1.2.4-py2.py3-none-any.whl (7.9 kB)
Collecting alembic<=1.4.1

In [21]:
from pycaret.classification import setup, compare_models, blend_models, finalize_model, predict_model

In [22]:
def pycaret_model(train, target, n_select, fold, opt, session_id=42):
    """Compare several tree ensembles with PyCaret, blend the best, and score them.

    Parameters
    ----------
    train : pandas.DataFrame
        Data set containing the features and the target column.
    target : str
        Name of the target column in ``train``.
    n_select : int
        Number of top-ranked models to keep from the comparison and blend.
    fold : int
        Number of cross-validation folds used for comparing and blending.
    opt : str
        Metric name used both to rank the candidates and to optimise the
        blend (e.g. ``'Accuracy'``).
    session_id : int or None, default 42
        Random seed forwarded to ``setup`` so that the split and the model
        results are reproducible. Pass ``None`` to let PyCaret choose a
        random seed on each run.

    Returns
    -------
    pandas.DataFrame
        Output of ``predict_model`` for the blended model; called without
        explicit data, so PyCaret scores its own hold-out split.
    """
    print('Setup Your Data....')
    setup(data=train,
          target=target,
          numeric_imputation='mean',
          session_id=session_id,
          silent=True)  # silent=True skips the interactive dtype confirmation prompt

    print('Comparing Models....')
    # Candidate estimators:
    #   gbc      = gradient boosting classifier
    #   rf       = random forest classifier
    #   et       = extra trees classifier
    #   xgboost  = XGBoost classifier
    #   lightgbm = LightGBM classifier
    #   catboost = CatBoost classifier
    best = compare_models(sort=opt, n_select=n_select, fold=fold,
                          include=['gbc', 'rf', 'et', 'xgboost', 'lightgbm', 'catboost'])
    # With n_select == 1 compare_models returns a bare estimator rather than a
    # list, while blend_models requires a list.
    if not isinstance(best, list):
        best = [best]

    print('Blending Models....')
    blended = blend_models(estimator_list=best, fold=fold, optimize=opt)
    pred = predict_model(blended)

    return pred

In [23]:
# Keep the 5 best models, using 3-fold CV, ranked and blended on Accuracy.
# NOTE(review): the results below show ~0.99 accuracy with recall around 0.1, so
# Accuracy may not be the most informative metric here - consider 'F1', 'Recall' or 'AUC'.
pycaret_model(train=df, target='isFraud', n_select=5, fold=3, opt='Accuracy')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9855,0.8477,0.079,0.9956,0.1464,0.1444,0.2784
1,0.9856,0.8559,0.0828,0.9958,0.1529,0.1509,0.2851
2,0.9855,0.8545,0.0793,1.0,0.147,0.1451,0.2796
Mean,0.9855,0.8527,0.0804,0.9972,0.1488,0.1468,0.281
SD,0.0,0.0036,0.0017,0.002,0.0029,0.0029,0.0029


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.9858,0.8635,0.1085,0.9976,0.1958,0.1933,0.3267


Unnamed: 0,accountNumber,transactionAmount,merchantName,cardCVV,cardLast4Digits,currentBalance,transactionDateTime_day,transactionDateTime_hour,transactionDateTime_minute,transactionDateTime_second,...,dateOfLastAddressChange_month_3,dateOfLastAddressChange_month_4,dateOfLastAddressChange_month_5,dateOfLastAddressChange_month_6,dateOfLastAddressChange_month_7,dateOfLastAddressChange_month_8,dateOfLastAddressChange_month_9,isFraud,Label,Score
0,511208960.0,24.049999,2054.0,538.0,1829.0,983.820007,7.0,20.0,43.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.9979
1,515733760.0,158.460007,188.0,349.0,4007.0,0.000000,2.0,23.0,42.0,42.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0,0.9864
2,128520416.0,681.739990,2472.0,725.0,2232.0,3846.169922,21.0,20.0,14.0,56.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.8913
3,380680256.0,36.970001,1198.0,869.0,593.0,1196.119995,5.0,23.0,51.0,58.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.9927
4,841357888.0,152.190002,2485.0,168.0,3452.0,15134.280273,11.0,16.0,3.0,7.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0,0.9642
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235904,380680256.0,528.530029,600.0,869.0,593.0,3367.489990,21.0,7.0,39.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.9571
235905,582676416.0,120.150002,482.0,385.0,7874.0,3849.229980,3.0,16.0,24.0,49.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.9946
235906,828981760.0,94.449997,596.0,261.0,5804.0,1234.660034,8.0,15.0,37.0,23.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.9929
235907,380680256.0,18.940001,600.0,869.0,593.0,4693.040039,2.0,6.0,18.0,36.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.9922
