In [48]:
#import libraries
from sys import modules
from os import listdir
import pandas as pd
import numpy as np
if not "seaborn" in modules:
    %pip install seaborn
import seaborn as sns
if not "matplotlib" in modules:
    %pip install matplotlib
import matplotlib as plt
if not "xgboost" in modules:
    %pip install xgboost
from xgboost import XGBClassifier
if not "sklearn" in modules:
    %pip install sklearn
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
if not "kaggle" in modules:
    %pip install kaggle
import kaggle

# random state to be used for the whole program
random_state = 3

In [36]:
# set up kaggle API access
# make sure to upload your API key, which should be in a file called "kaggle.json"
if "kaggle.json" in listdir():
    if not ".kaggle" in listdir():
        %mkdir ~/.kaggle/
    if not ".kaggle" in listdir(".kaggle"):
        %mv kaggle.json ~/.kaggle/

# copy this into the terminal
# chmod 600 ~/.kaggle/kaggle.json

UsageError: Line magic function `%chmod` not found.


You can download the dataset [here](https://www.kaggle.com/code/arjunjoshua/predicting-fraud-in-financial-payment-services/input).

In [46]:
df = pd.read_csv("PS_20174392719_1491204439457_log.csv")

In [7]:
df.shape
# (6362620, 11)

(6362620, 11)

In [None]:
# see if anything pops out with basic stats
df.describe()

In [9]:
# look at data types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


In [None]:
df.columns
# step
# Maps a unit of time in the real world. In this case 1 step is 1 hour of time.

# type
# CASH-IN, CASH-OUT, DEBIT, PAYMENT and TRANSFER

# amount
# amount of the transaction in local currency

# nameOrig
# customer who started the transaction

# oldbalanceOrg
# initial balance before the transaction

# newbalanceOrg
# customer's balance after the transaction.

# nameDest
# recipient ID of the transaction.

# oldbalanceDest
# initial recipient balance before the transaction.

# newbalanceDest
# recipient's balance after the transaction.

# isFraud
# identifies a fraudulent transaction (1) and non fraudulent (0)

# isFlaggedFraud
# flags illegal attempts to transfer more than 200.000 in a single transaction.

In [37]:
# find out how much fraud occurs
n_frauds = df[df["isFraud"] == 1].shape[0]
print(f'{n_frauds} frauds out of {df.shape[0]}')
print(f"{n_frauds/df.shape[0]} %")
# 8213 frauds out of 6362620
# 0.001290820448180152 %

8213 frauds out of 6362620
0.001290820448180152 %


In [44]:
# find out the success rate for flagging transactions as fraud
mask1 = (df["isFlaggedFraud"] == 1) & (df["isFraud"] == 1)
correct_fraud_flags = df[mask1].shape[0]
mask2 = (df["isFlaggedFraud"] == 0) & (df["isFraud"] == 1)
incorrect_fraud_flags = df[mask2].shape[0]
print(f"                    frauds: {n_frauds}")
print(f"  frauds flagged correctly: {correct_fraud_flags}")
print(f"frauds flagged incorrectly: {incorrect_fraud_flags}")
fraud_flag_success_rate = 100 * correct_fraud_flags / n_frauds
print(f"   fraud flag success rate: {round(fraud_flag_success_rate, 3)}%")

                    frauds: 8213
  frauds flagged correctly: 16
frauds flagged incorrectly: 8197
   fraud flag success rate: 0.195%


In [None]:
# see if any columns should be converted to numerical values
for column in df.columns:
    print(column, df.dtypes[column], df[column].unique()[:5])

In [None]:
# see if anything pops out correlation-wise
corr = df.corr()
# corr
# ax = sns.heatmap(
#     corr, 
#     vmin=-1, vmax=1, center=0,
#     cmap=sns.diverging_palette(20, 220, n=200),
#     square=True
# )
# ax.set_xticklabels(
#     ax.get_xticklabels(),
#     rotation=45,
#     horizontalalignment='right'
# );

In [18]:
# use xg boost
X_train, X_test, y_train, y_test = train_test_split(
    df[df.columns.drop(["isFraud", "isFlaggedFraud"])],
    df["isFraud"],
    test_size=.25,
    random_state=random_state
)

In [53]:
parameter_grid = {
    "n_estimators": [50, 100, 150],
    "max_depth": [10, 15, 20],
    "grow_policy": [0, 1],
    "learning_rate": [0.1, 0.05, 0.01],
    "booster": ["gbtree", "gblinear", "dart"],
    "n_jobs": [-1],
    "reg_alpha": [0.1, 0.05, 0.01],
    "reg_lambda": [0.1, 0.05, 0.01],
    "random_state": [random_state]
}
permutations = 1
for k in parameter_grid:
    permutations *= len(parameter_grid[k])
print(permutations)
    

1458


In [58]:
xgb = XGBClassifier()

In [59]:
randomized_search_CV = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=parameter_grid,
    random_state=random_state,
    cv=3,
    n_iter=10,
    n_jobs=-1
)

In [60]:
randomized_search_CV.fit(
    X=X_train,
    y=y_train
)