# Predicting financial fraud

First, let's import the necessary libraries.

In [2]:
from sys import modules
from os import listdir
import pandas as pd
import numpy as np
if not "seaborn" in modules:
    %pip install seaborn
import seaborn as sns
if not "matplotlib" in modules:
    %pip install matplotlib
import matplotlib as plt
if not "xgboost" in modules:
    %pip install xgboost
from xgboost import XGBClassifier
if not "sklearn" in modules:
    %pip install sklearn
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
if not "kaggle" in modules:
    %pip install kaggle
import kaggle

# random state to be used for the whole program
random_state = 3

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [3]:
# set up kaggle API access
# make sure to upload your API key, which should be in a file called "kaggle.json"
if "kaggle.json" in listdir():
    if not ".kaggle" in listdir():
        %mkdir ~/.kaggle/
    if not ".kaggle" in listdir(".kaggle"):
        %mv kaggle.json ~/.kaggle/

# copy this into the terminal
# chmod 600 ~/.kaggle/kaggle.json

You can download the dataset [here](https://www.kaggle.com/code/arjunjoshua/predicting-fraud-in-financial-payment-services/input).

In [4]:
df = pd.read_csv("PS_20174392719_1491204439457_log.csv")

In [5]:
df.shape
# (6362620, 11)

(6362620, 11)

In [6]:
# see if anything pops out with basic stats
df.describe()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
count,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0
mean,243.3972,179861.9,833883.1,855113.7,1100702.0,1224996.0,0.00129082,2.514687e-06
std,142.332,603858.2,2888243.0,2924049.0,3399180.0,3674129.0,0.0359048,0.001585775
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,156.0,13389.57,0.0,0.0,0.0,0.0,0.0,0.0
50%,239.0,74871.94,14208.0,0.0,132705.7,214661.4,0.0,0.0
75%,335.0,208721.5,107315.2,144258.4,943036.7,1111909.0,0.0,0.0
max,743.0,92445520.0,59585040.0,49585040.0,356015900.0,356179300.0,1.0,1.0


In [7]:
# look at data types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


In [8]:
df.columns
# step
# Maps a unit of time in the real world. In this case 1 step is 1 hour of time.

# type
# CASH-IN, CASH-OUT, DEBIT, PAYMENT and TRANSFER

# amount
# amount of the transaction in local currency

# nameOrig
# customer who started the transaction

# oldbalanceOrg
# initial balance before the transaction

# newbalanceOrg
# customer's balance after the transaction.

# nameDest
# recipient ID of the transaction.

# oldbalanceDest
# initial recipient balance before the transaction.

# newbalanceDest
# recipient's balance after the transaction.

# isFraud
# identifies a fraudulent transaction (1) and non fraudulent (0)

# isFlaggedFraud
# flags illegal attempts to transfer more than 200.000 in a single transaction.

Index(['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
       'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud',
       'isFlaggedFraud'],
      dtype='object')

In [9]:
# find out how much fraud occurs
n_frauds = df[df["isFraud"] == 1].shape[0]
print(f'{n_frauds} frauds out of {df.shape[0]}')
print(f"{n_frauds/df.shape[0]} %")
# 8213 frauds out of 6362620
# 0.001290820448180152 %

8213 frauds out of 6362620
0.001290820448180152 %


In [10]:
# find out the success rate for flagging transactions as fraud
mask1 = (df["isFlaggedFraud"] == 1) & (df["isFraud"] == 1)
correct_fraud_flags = df[mask1].shape[0]
mask2 = (df["isFlaggedFraud"] == 0) & (df["isFraud"] == 1)
incorrect_fraud_flags = df[mask2].shape[0]
print(f"                    frauds: {n_frauds}")
print(f"  frauds flagged correctly: {correct_fraud_flags}")
print(f"frauds flagged incorrectly: {incorrect_fraud_flags}")
fraud_flag_success_rate = 100 * correct_fraud_flags / n_frauds
print(f"   fraud flag success rate: {round(fraud_flag_success_rate, 3)}%")

                    frauds: 8213
  frauds flagged correctly: 16
frauds flagged incorrectly: 8197
   fraud flag success rate: 0.195%


In [11]:
# see if any columns should be converted to numerical values
for column in df.columns:
    print(column, df.dtypes[column], df[column].unique()[:5])

step int64 [1 2 3 4 5]
type object ['PAYMENT' 'TRANSFER' 'CASH_OUT' 'DEBIT' 'CASH_IN']
amount float64 [ 9839.64  1864.28   181.   11668.14  7817.71]
nameOrig object ['C1231006815' 'C1666544295' 'C1305486145' 'C840083671' 'C2048537720']
oldbalanceOrg float64 [170136.  21249.    181.  41554.  53860.]
newbalanceOrig float64 [160296.36  19384.72      0.    29885.86  46042.29]
nameDest object ['M1979787155' 'M2044282225' 'C553264065' 'C38997010' 'M1230701703']
oldbalanceDest float64 [    0. 21182. 41898. 10845.  5083.]
newbalanceDest float64 [     0.    40348.79 157982.12  51513.44  16896.7 ]
isFraud int64 [0 1]
isFlaggedFraud int64 [0 1]


In [12]:
# see if anything pops out correlation-wise
corr = df.corr()
# corr
# ax = sns.heatmap(
#     corr, 
#     vmin=-1, vmax=1, center=0,
#     cmap=sns.diverging_palette(20, 220, n=200),
#     square=True
# )
# ax.set_xticklabels(
#     ax.get_xticklabels(),
#     rotation=45,
#     horizontalalignment='right'
# );

  corr = df.corr()


In [13]:
X = df[df.columns.drop(["isFraud", "isFlaggedFraud"])].select_dtypes(
    include=["int64", "float64"]
)
y = df["isFraud"]

In [14]:
X.sample(5)

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest
3846334,282,78501.41,11021.0,0.0,387425.31,465926.72
2944073,230,56614.18,38813.0,0.0,3508031.07,3564645.25
5784444,400,40480.76,0.0,0.0,66636.52,107117.28
4608418,329,12143.62,20561.04,8417.43,0.0,0.0
47499,9,35311.34,0.0,0.0,0.0,0.0


In [30]:
# use xg boost
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.25,
    random_state=random_state
)

In [35]:
parameter_grid_rscv = {
    "reg_lambda": [0.01],
    "reg_alpha": [0.1],
    "random_state": [3],
    "n_jobs": [-1],
    "n_estimators": [175, 200, 225],
    "learning_rate": [0.1, 0.08, 0.12],
    "booster": ["dart"],
}   

In [36]:
xgb = XGBClassifier()

In [37]:
randomized_search_CV = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=parameter_grid_rscv,
    random_state=random_state,
    cv=3,
    n_iter=10,
    n_jobs=-1
)

In [None]:
# run 1
# 151 minutes
# run 2
# 578 minutes
# run 3
# ___ minutes
rscv = randomized_search_CV.fit(
    X=X_train,
    y=y_train
)

In [None]:
#  run 1
# {'reg_lambda': 0.01,
# 'reg_alpha': 0.1,
# 'random_state': 3,
# 'n_jobs': -1,
# 'n_estimators': 150,
# 'learning_rate': 0.1,
# 'booster': 'dart'}
#  run 2
# {
#     "reg_lambda": 0.01,
#     "reg_alpha": 0.1,
#     "random_state": 3,
#     "n_jobs": -1,
#     "n_estimators": 175,
#     "learning_rate": 0.1,
#     "booster": dart,
# }
# run 3  
result = "{\n"
for k in rscv.best_params_:
    result += f"    \"{k}\": {rscv.best_params_[k]},\n"
result += "}"
print(result)


In [40]:
xgb_rscv = XGBClassifier(
    n_estimators=175,
    learning_rate=0.1,
    booster="dart",
    reg_alpha=0.1,
    reg_lambda=0.01,
    n_jobs=-1,
    random_state=random_state
)

In [40]:
# parameter_grid_gscv = {
#     "n_estimators": [125, 150, 175],
#     "learning_rate": [0.1, 0.12, 0.08],
#     "booster": ["gbtree", "gblinear", "dart"],
#     "n_jobs": [-1],
#     "reg_alpha": [0.1, 0.08, 0.12],
#     "reg_lambda": [0.012, 0.008, 0.01],
#     "random_state": [random_state]
# }

In [42]:
# grid_search_cv = GridSearchCV(
#     estimator=xgb,
#     param_grid=parameter_grid_gscv,
#     cv=3,
#     n_jobs=-1
# )

In [None]:
# gscv = grid_search_cv.fit(
#     X=X_train,
#     y=y_train
# )

In [None]:
# result = "{\n"
# for k in gscv.best_params_:
#     result += f"    \"{k}\": {rscv.best_params_[k]},\n"
# result += "}"
# print(result)

In [None]:
xgb_rscv.fit(X_train, y_train)

In [None]:
# predict fraud with XGBoost model
y_predictions = xgb_rscv.predict(X_test)