# Detecting Fraudulent Transactions

## Setting a custom display style

In [1]:
from IPython.core.display import HTML

def css_styling(path="../styles/custom.css"):
    """Load a style sheet from disk and wrap it for display in the notebook.

    Parameters
    ----------
    path : str, optional
        Location of the style file. Defaults to the path this notebook
        originally hard-coded, resolved against the current working directory.

    Returns
    -------
    An ``HTML`` display object built from the file's text.
    """
    # The context manager closes the handle; the earlier version left it open.
    with open(path, "r") as style_file:
        styles = style_file.read()
    return HTML(styles)

css_styling()

## Importing the Libraries

In [2]:
import os
import json
import time
import numpy as np
import pandas as pd
import missingno as msno
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Display configuration for DataFrame output.
# Options are addressed by their full "display.*" names: the short pattern
# 'precision' matches more than one option in newer pandas releases and
# raises OptionError there.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", None)
pd.set_option("display.precision", 4)
pd.set_option("display.max_colwidth", 20)
# Floats are rendered through this formatter: width 20, thousands separators,
# two decimals.
pd.options.display.float_format = '{:20,.2f}'.format

## Importing the Dataset

In [3]:
# Move the working directory up one level; the data paths built in the
# following cells are derived from os.getcwd().
# NOTE(review): not idempotent — each re-run of this cell climbs one more
# directory. Restart the kernel before running it again.
os.chdir('..')

In [4]:
# Directory holding the raw input, built from the current working directory.
DATA_PATH = "".join((os.getcwd(), '//data//raw//'))
# File name of the transactions dump inside DATA_PATH.
fname = 'transactions.txt'

In [5]:
def read_file(file_name):
    """Load a JSON-lines text file into a pandas DataFrame.

    Parameters
    ----------
    file_name : str
        Path to a text file holding one JSON document per line.

    Returns
    -------
    pandas.DataFrame
        One row per parsed line.
    """
    records = []
    # Open the path that was passed in; the earlier version ignored
    # ``file_name`` and always read the globals DATA_PATH + fname.
    with open(file_name) as input_file:
        for line in input_file:
            # Whitespace-only lines (e.g. a blank line at the end of the
            # file) are not valid JSON, so skip them instead of failing.
            if line.strip():
                records.append(json.loads(line))
    return pd.DataFrame(records)

In [6]:
# Wall-clock timing around the raw-file load.
start = time.time()
df = read_file(DATA_PATH + fname)
stop = time.time()
elapsed = round(stop - start, 2)
print("Time Elapsed ->{} seconds".format(elapsed))

Time Elapsed ->37.69 seconds


### Saving the `df` to `csv`

In [7]:
# Write the parsed frame to the interim data folder (without the index).
# The folder is created first because DataFrame.to_csv does not create
# missing parent directories.
interim_dir = os.getcwd() + '//data//interim//'
os.makedirs(interim_dir, exist_ok=True)
df.to_csv(interim_dir + 'transactions.csv', index=False)

## EDA

In [8]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,accountNumber,customerId,creditLimit,availableMoney,transactionDateTime,transactionAmount,merchantName,acqCountry,merchantCountryCode,posEntryMode,posConditionCode,merchantCategoryCode,currentExpDate,accountOpenDate,dateOfLastAddressChange,cardCVV,enteredCVV,cardLast4Digits,transactionType,echoBuffer,currentBalance,merchantCity,merchantState,merchantZip,cardPresent,posOnPremises,recurringAuthInd,expirationDateKeyInMatch,isFraud
0,737265056,737265056,5000.0,5000.0,2016-08-13T14:27:32,98.55,Uber,US,US,2,1,rideshare,06/2023,2015-03-14,2015-03-14,414,414,1803,PURCHASE,,0.0,,,,False,,,False,False
1,737265056,737265056,5000.0,5000.0,2016-10-11T05:05:54,74.51,AMC #191138,US,US,9,1,entertainment,02/2024,2015-03-14,2015-03-14,486,486,767,PURCHASE,,0.0,,,,True,,,False,False
2,737265056,737265056,5000.0,5000.0,2016-11-08T09:18:39,7.47,Play Store,US,US,9,1,mobileapps,08/2025,2015-03-14,2015-03-14,486,486,767,PURCHASE,,0.0,,,,False,,,False,False
3,737265056,737265056,5000.0,5000.0,2016-12-10T02:14:50,7.47,Play Store,US,US,9,1,mobileapps,08/2025,2015-03-14,2015-03-14,486,486,767,PURCHASE,,0.0,,,,False,,,False,False
4,830329091,830329091,5000.0,5000.0,2016-03-24T21:04:46,71.18,Tim Hortons #947751,US,US,2,1,fastfood,10/2029,2015-08-06,2015-08-06,885,885,3143,PURCHASE,,0.0,,,,True,,,False,False


In [9]:
# List each column with its non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 786363 entries, 0 to 786362
Data columns (total 29 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   accountNumber             786363 non-null  object 
 1   customerId                786363 non-null  object 
 2   creditLimit               786363 non-null  float64
 3   availableMoney            786363 non-null  float64
 4   transactionDateTime       786363 non-null  object 
 5   transactionAmount         786363 non-null  float64
 6   merchantName              786363 non-null  object 
 7   acqCountry                786363 non-null  object 
 8   merchantCountryCode       786363 non-null  object 
 9   posEntryMode              786363 non-null  object 
 10  posConditionCode          786363 non-null  object 
 11  merchantCategoryCode      786363 non-null  object 
 12  currentExpDate            786363 non-null  object 
 13  accountOpenDate           786363 non-null  o

In [10]:
# Summary statistics; only the float columns appear in the result.
df.describe()

Unnamed: 0,creditLimit,availableMoney,transactionAmount,currentBalance
count,786363.0,786363.0,786363.0,786363.0
mean,10759.46,6250.73,136.99,4508.74
std,11636.17,8880.78,147.73,6457.44
min,250.0,-1005.63,0.0,0.0
25%,5000.0,1077.42,33.65,689.91
50%,7500.0,3184.86,87.9,2451.76
75%,15000.0,7500.0,191.48,5291.1
max,50000.0,50000.0,2011.54,47498.81


### Coding empty and whitespace-only strings as `NaN`s

In [11]:
# Cells that are empty or contain only whitespace are treated as missing.
blank_pattern = r'^\s*$'
df_temp = df.replace(to_replace=blank_pattern, value=np.nan, regex=True)
df_temp.head()

Unnamed: 0,accountNumber,customerId,creditLimit,availableMoney,transactionDateTime,transactionAmount,merchantName,acqCountry,merchantCountryCode,posEntryMode,posConditionCode,merchantCategoryCode,currentExpDate,accountOpenDate,dateOfLastAddressChange,cardCVV,enteredCVV,cardLast4Digits,transactionType,echoBuffer,currentBalance,merchantCity,merchantState,merchantZip,cardPresent,posOnPremises,recurringAuthInd,expirationDateKeyInMatch,isFraud
0,737265056,737265056,5000.0,5000.0,2016-08-13T14:27:32,98.55,Uber,US,US,2,1,rideshare,06/2023,2015-03-14,2015-03-14,414,414,1803,PURCHASE,,0.0,,,,False,,,False,False
1,737265056,737265056,5000.0,5000.0,2016-10-11T05:05:54,74.51,AMC #191138,US,US,9,1,entertainment,02/2024,2015-03-14,2015-03-14,486,486,767,PURCHASE,,0.0,,,,True,,,False,False
2,737265056,737265056,5000.0,5000.0,2016-11-08T09:18:39,7.47,Play Store,US,US,9,1,mobileapps,08/2025,2015-03-14,2015-03-14,486,486,767,PURCHASE,,0.0,,,,False,,,False,False
3,737265056,737265056,5000.0,5000.0,2016-12-10T02:14:50,7.47,Play Store,US,US,9,1,mobileapps,08/2025,2015-03-14,2015-03-14,486,486,767,PURCHASE,,0.0,,,,False,,,False,False
4,830329091,830329091,5000.0,5000.0,2016-03-24T21:04:46,71.18,Tim Hortons #947751,US,US,2,1,fastfood,10/2029,2015-08-06,2015-08-06,885,885,3143,PURCHASE,,0.0,,,,True,,,False,False


### Removing columns containing only `NaN`s

In [12]:
# Drop every column that holds nothing but NaN, reporting the column count
# before and after.
n_cols_before = df_temp.shape[1]
print("Previous # of Columns ->{}".format(n_cols_before))
df_temp = df_temp.dropna(axis='columns', how='all')
print("Latest # of Columns ->{}".format(len(df_temp.columns)))

Previous # of Columns ->29
Latest # of Columns ->23


### Missing Data

In [13]:
# msno.matrix(df_temp, figsize=(28, 20), color=(.33, .33, .34));

In [14]:
# Fill the gaps in these columns with each column's most frequent value
# (the first entry returned by ``mode()``).
mode_filled_cols = ['acqCountry', 'merchantCountryCode', 'posEntryMode', 'transactionType']
for col_name in mode_filled_cols:
    most_common = df_temp[col_name].mode()[0]
    df_temp[col_name] = df_temp[col_name].fillna(most_common)

In [15]:
# msno.matrix(df_temp, figsize=(28, 20), color=(.33, .33, .34));

In [16]:
# Per column: does any missing value remain?
# posConditionCode is not among the columns imputed in the previous code cell.
df_temp.isnull().any()

accountNumber               False
customerId                  False
creditLimit                 False
availableMoney              False
transactionDateTime         False
transactionAmount           False
merchantName                False
acqCountry                  False
merchantCountryCode         False
posEntryMode                False
posConditionCode             True
merchantCategoryCode        False
currentExpDate              False
accountOpenDate             False
dateOfLastAddressChange     False
cardCVV                     False
enteredCVV                  False
cardLast4Digits             False
transactionType             False
currentBalance              False
cardPresent                 False
expirationDateKeyInMatch    False
isFraud                     False
dtype: bool

### Column Data Types

In [17]:
# Show the dtype of each remaining column.
df_temp.dtypes

accountNumber                object
customerId                   object
creditLimit                 float64
availableMoney              float64
transactionDateTime          object
transactionAmount           float64
merchantName                 object
acqCountry                   object
merchantCountryCode          object
posEntryMode                 object
posConditionCode             object
merchantCategoryCode         object
currentExpDate               object
accountOpenDate              object
dateOfLastAddressChange      object
cardCVV                      object
enteredCVV                   object
cardLast4Digits              object
transactionType              object
currentBalance              float64
cardPresent                    bool
expirationDateKeyInMatch       bool
isFraud                        bool
dtype: object

In [18]:
# Label column name.
target = 'isFraud'
# Columns with a numeric dtype.
num_dtypes = df_temp.select_dtypes(include=np.number).columns.tolist()
# Every remaining column except the label.
obj_dtypes = [col for col in df_temp.columns if not (col in num_dtypes or col == target)]

In [19]:
# Display the numeric feature names.
num_dtypes

['creditLimit', 'availableMoney', 'transactionAmount', 'currentBalance']

In [20]:
# Display the non-numeric feature names.
obj_dtypes

['accountNumber',
 'customerId',
 'transactionDateTime',
 'merchantName',
 'acqCountry',
 'merchantCountryCode',
 'posEntryMode',
 'posConditionCode',
 'merchantCategoryCode',
 'currentExpDate',
 'accountOpenDate',
 'dateOfLastAddressChange',
 'cardCVV',
 'enteredCVV',
 'cardLast4Digits',
 'transactionType',
 'cardPresent',
 'expirationDateKeyInMatch']

In [21]:
# Sanity check: the two feature lists together account for every column
# except the target.
len(num_dtypes) + len(obj_dtypes) == df_temp.shape[1] - 1

True

## Building a Classifier

### Split into $X$ and $y$

In [22]:
# Features: numeric columns first, followed by the non-numeric ones.
feature_cols = num_dtypes + obj_dtypes
X = df_temp.filter(items=feature_cols)
# Label, kept as a one-column DataFrame.
y = df_temp.filter(items=[target])

In [23]:
# Preview the feature frame.
X.head()

Unnamed: 0,creditLimit,availableMoney,transactionAmount,currentBalance,accountNumber,customerId,transactionDateTime,merchantName,acqCountry,merchantCountryCode,posEntryMode,posConditionCode,merchantCategoryCode,currentExpDate,accountOpenDate,dateOfLastAddressChange,cardCVV,enteredCVV,cardLast4Digits,transactionType,cardPresent,expirationDateKeyInMatch
0,5000.0,5000.0,98.55,0.0,737265056,737265056,2016-08-13T14:27:32,Uber,US,US,2,1,rideshare,06/2023,2015-03-14,2015-03-14,414,414,1803,PURCHASE,False,False
1,5000.0,5000.0,74.51,0.0,737265056,737265056,2016-10-11T05:05:54,AMC #191138,US,US,9,1,entertainment,02/2024,2015-03-14,2015-03-14,486,486,767,PURCHASE,True,False
2,5000.0,5000.0,7.47,0.0,737265056,737265056,2016-11-08T09:18:39,Play Store,US,US,9,1,mobileapps,08/2025,2015-03-14,2015-03-14,486,486,767,PURCHASE,False,False
3,5000.0,5000.0,7.47,0.0,737265056,737265056,2016-12-10T02:14:50,Play Store,US,US,9,1,mobileapps,08/2025,2015-03-14,2015-03-14,486,486,767,PURCHASE,False,False
4,5000.0,5000.0,71.18,0.0,830329091,830329091,2016-03-24T21:04:46,Tim Hortons #947751,US,US,2,1,fastfood,10/2029,2015-08-06,2015-08-06,885,885,3143,PURCHASE,True,False


In [24]:
# Multiplying by 1 turns the boolean label into integer 0/1 values.
y_new = y.mul(1)
y_new.head()

Unnamed: 0,isFraud
0,0
1,0
2,0
3,0
4,0


### Simplistic model

**drop the following columns**

``` 
'accountNumber', 'customerId', 'transactionDateTime',
'merchantName', 'acqCountry', 'merchantCountryCode', 'posEntryMode', 'posConditionCode',
'accountOpenDate', 'dateOfLastAddressChange', 'cardCVV', 'enteredCVV', 'cardLast4Digits',
'currentExpDate'
```

In [25]:
# Columns excluded from the simplistic model.
drop_cols = [
    'accountNumber',
    'customerId',
    'transactionDateTime',
    'merchantName',
    'acqCountry',
    'merchantCountryCode',
    'posEntryMode',
    'accountOpenDate',
    'dateOfLastAddressChange',
    'cardCVV',
    'enteredCVV',
    'cardLast4Digits',
    'posConditionCode',
    'currentExpDate',
]

X_new = X.drop(drop_cols, axis=1)

In [26]:
# Collect the remaining object and boolean columns; these are the ones that
# get encoded below.
# dtypes are named by string because the ``np.object`` / ``np.bool`` aliases
# were removed from NumPy and raise AttributeError on current releases.
obj_dtypes = X_new.select_dtypes(include=['object', 'bool']).columns.tolist()

In [27]:
# Preview the reduced feature frame.
X_new.head()

Unnamed: 0,creditLimit,availableMoney,transactionAmount,currentBalance,merchantCategoryCode,transactionType,cardPresent,expirationDateKeyInMatch
0,5000.0,5000.0,98.55,0.0,rideshare,PURCHASE,False,False
1,5000.0,5000.0,74.51,0.0,entertainment,PURCHASE,True,False
2,5000.0,5000.0,7.47,0.0,mobileapps,PURCHASE,False,False
3,5000.0,5000.0,7.47,0.0,mobileapps,PURCHASE,False,False
4,5000.0,5000.0,71.18,0.0,fastfood,PURCHASE,True,False


### One-Hot Encoding/LabelEncoding

In [28]:
# One-hot encode the object/boolean columns (dropping the first level of
# each) and pass every other column through unchanged.
one_hot = OneHotEncoder(drop='first')
columnTransformer = ColumnTransformer(
    transformers=[('encoder', one_hot, obj_dtypes)],
    remainder='passthrough',
)

# Encoded feature matrix produced by the transformer.
X_data = columnTransformer.fit_transform(X_new)

In [29]:
# Inspect the first (and only) transformer spec: name, encoder, column list.
columnTransformer.transformers[0]

('encoder',
 OneHotEncoder(categories='auto', drop='first', dtype=<class 'numpy.float64'>,
               handle_unknown='error', sparse=True),
 ['merchantCategoryCode',
  'transactionType',
  'cardPresent',
  'expirationDateKeyInMatch'])

In [30]:
# Integer-encode each object/boolean column, then expand those integer codes
# into indicator columns.
# NOTE(review): X_new is overwritten here, so this cell cannot be re-run on
# its own — after get_dummies the columns named in obj_dtypes no longer exist.
le = LabelEncoder()

for i in obj_dtypes:
    # The single encoder is refit for every column, so afterwards it only
    # holds the classes of the last column processed.
    X_new[i] = le.fit_transform(X_new[i])

# The values are already integer codes at this point, so the indicator
# columns are named <column>_<code> rather than after the category labels.
X_new = pd.get_dummies(data=X_new, columns=obj_dtypes)
X_new.head()

Unnamed: 0,creditLimit,availableMoney,transactionAmount,currentBalance,merchantCategoryCode_0,merchantCategoryCode_1,merchantCategoryCode_2,merchantCategoryCode_3,merchantCategoryCode_4,merchantCategoryCode_5,merchantCategoryCode_6,merchantCategoryCode_7,merchantCategoryCode_8,merchantCategoryCode_9,merchantCategoryCode_10,merchantCategoryCode_11,merchantCategoryCode_12,merchantCategoryCode_13,merchantCategoryCode_14,merchantCategoryCode_15,merchantCategoryCode_16,merchantCategoryCode_17,merchantCategoryCode_18,transactionType_0,transactionType_1,transactionType_2,cardPresent_0,cardPresent_1,expirationDateKeyInMatch_0,expirationDateKeyInMatch_1
0,5000.0,5000.0,98.55,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,1,0
1,5000.0,5000.0,74.51,0.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0
2,5000.0,5000.0,7.47,0.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,1,0
3,5000.0,5000.0,7.47,0.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,1,0
4,5000.0,5000.0,71.18,0.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0


In [31]:
# Preview the encoded feature frame (same view as the end of the previous cell).
X_new.head()

Unnamed: 0,creditLimit,availableMoney,transactionAmount,currentBalance,merchantCategoryCode_0,merchantCategoryCode_1,merchantCategoryCode_2,merchantCategoryCode_3,merchantCategoryCode_4,merchantCategoryCode_5,merchantCategoryCode_6,merchantCategoryCode_7,merchantCategoryCode_8,merchantCategoryCode_9,merchantCategoryCode_10,merchantCategoryCode_11,merchantCategoryCode_12,merchantCategoryCode_13,merchantCategoryCode_14,merchantCategoryCode_15,merchantCategoryCode_16,merchantCategoryCode_17,merchantCategoryCode_18,transactionType_0,transactionType_1,transactionType_2,cardPresent_0,cardPresent_1,expirationDateKeyInMatch_0,expirationDateKeyInMatch_1
0,5000.0,5000.0,98.55,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,1,0
1,5000.0,5000.0,74.51,0.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0
2,5000.0,5000.0,7.47,0.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,1,0
3,5000.0,5000.0,7.47,0.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,1,0
4,5000.0,5000.0,71.18,0.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0


### Train-Test Split

In [36]:
 # Hold out a quarter of the rows for testing; the seed is fixed.
 X_train, X_test, y_train, y_test = train_test_split(
     X_new.values,
     y_new.values,
     test_size=0.25,
     random_state=0,
 )

In [37]:
#  X_train, X_test, y_train, y_test = train_test_split(X_data, y_new.values, test_size = 0.25, random_state = 0)

### Applying `StandardScaler`

In [38]:
# Fit the scaler on the training split only, then apply that same fitted
# transformation to both splits.
sc = StandardScaler()
sc.fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

### Train Logistic Regression Model

In [39]:
from sklearn.linear_model import LogisticRegression

In [40]:
# L1-penalised logistic regression with balanced class weights and a fixed seed.
log_reg = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    class_weight='balanced',
    random_state=0,
)

In [None]:
# The label array is flattened to one dimension before fitting.
log_reg.fit(X_train, np.ravel(y_train))

In [None]:
# One fitted coefficient per encoded feature, indexed by feature name.
coeff_df = pd.DataFrame({'Coefficient': log_reg.coef_[0]}, index=X_new.columns)
coeff_df

In [None]:
# Predicted class labels for the held-out split.
y_pred = log_reg.predict(X_test)
y_pred

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, roc_curve, auc
# Evaluate the classifier on the held-out split.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
print("CONFUSION MATRIX \n", cm)
acc = accuracy_score(y_test, y_pred)
print("ACCURACY :", round(acc, 2))
precision = precision_score(y_test, y_pred)
print("PRECISION :", round(precision, 2))
recall = recall_score(y_test, y_pred)
print("RECALL :", round(recall, 2))
# The ROC curve sweeps the decision threshold, so it needs a continuous
# score. Feeding it the hard 0/1 predictions collapses the curve to a single
# operating point and misreports the AUC; use the positive-class probability.
y_score = log_reg.predict_proba(X_test)[:, 1]
fpr, tpr, thresh = roc_curve(y_test, y_score)
auc_score = auc(fpr, tpr)
print("AUC : ", round(auc_score, 2))

In [None]:
from sklearn.metrics import classification_report

# Text summary of the per-class metrics for the held-out predictions.
print(classification_report(y_test, y_pred))