In [1]:
import tensorflow as tf
import sklearn
import pandas as pd
import numpy as np

# A. Process the data
- The dataset we'll be using is Bank transactions for fraud detection.
- First, let's generate a numpy array of our data.

- Because the dataset is so large, please download the csv file yourself and change the filepath appropriately.
- Link: https://www.kaggle.com/datasets/kornilovag94/payment-systems-transactions-synthetic-dataset

In [2]:
df = pd.read_csv('../datasets/Bank_Transactions.csv',delimiter=',')

In [3]:
df.head(10)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0
5,1,PAYMENT,7817.71,C90045638,53860.0,46042.29,M573487274,0.0,0.0,0,0
6,1,PAYMENT,7107.77,C154988899,183195.0,176087.23,M408069119,0.0,0.0,0,0
7,1,PAYMENT,7861.64,C1912850431,176087.23,168225.59,M633326333,0.0,0.0,0,0
8,1,PAYMENT,4024.36,C1265012928,2671.0,0.0,M1176932104,0.0,0.0,0,0
9,1,DEBIT,5337.77,C712410124,41720.0,36382.23,C195600860,41898.0,40348.79,0,0


In [4]:
targets = df.isFraud

In [5]:
df = df.drop(columns=["isFraud","isFlaggedFraud"])

In [6]:
def getHeader(x):
    return x[0]
def getBody(x):
    return x[1:]
#Split first character of nameOrig and nameDest column (name header)
nameOrig_header = df["nameOrig"].apply(getHeader)
nameOrig_body = df["nameOrig"].apply(getBody)

nameDest_header = df["nameDest"].apply(getHeader)
nameDest_body = df["nameDest"].apply(getBody)

In [7]:
#Insert the new columns to dataframe
df.insert(3,"nameOrig_header",nameOrig_header)
df.insert(4, "nameOrig_body", nameOrig_body)

df.insert(8, "nameDest_header", nameDest_header)
df.insert(9, "nameDest_body", nameDest_body)

In [8]:
df = df.drop(columns=["nameOrig", "nameDest"])

In [9]:
df.head()

Unnamed: 0,step,type,amount,nameOrig_header,nameOrig_body,oldbalanceOrg,newbalanceOrig,nameDest_header,nameDest_body,oldbalanceDest,newbalanceDest
0,1,PAYMENT,9839.64,C,1231006815,170136.0,160296.36,M,1979787155,0.0,0.0
1,1,PAYMENT,1864.28,C,1666544295,21249.0,19384.72,M,2044282225,0.0,0.0
2,1,TRANSFER,181.0,C,1305486145,181.0,0.0,C,553264065,0.0,0.0
3,1,CASH_OUT,181.0,C,840083671,181.0,0.0,C,38997010,21182.0,0.0
4,1,PAYMENT,11668.14,C,2048537720,41554.0,29885.86,M,1230701703,0.0,0.0


## Ordinal Encoding - type

In [10]:
from sklearn.preprocessing import OrdinalEncoder

type_encoder = OrdinalEncoder()

type_col = df.type.to_numpy().reshape(-1, 1)

enc_type = type_encoder.fit_transform(type_col)

In [11]:
df.type = pd.DataFrame(enc_type)

In [12]:
df.head()

Unnamed: 0,step,type,amount,nameOrig_header,nameOrig_body,oldbalanceOrg,newbalanceOrig,nameDest_header,nameDest_body,oldbalanceDest,newbalanceDest
0,1,3.0,9839.64,C,1231006815,170136.0,160296.36,M,1979787155,0.0,0.0
1,1,3.0,1864.28,C,1666544295,21249.0,19384.72,M,2044282225,0.0,0.0
2,1,4.0,181.0,C,1305486145,181.0,0.0,C,553264065,0.0,0.0
3,1,1.0,181.0,C,840083671,181.0,0.0,C,38997010,21182.0,0.0
4,1,3.0,11668.14,C,2048537720,41554.0,29885.86,M,1230701703,0.0,0.0


- We'll perform the same ordinal encoding for nameOrig_header and nameDest_header

In [13]:
#nameOrig
nameOrig_header_encoder = OrdinalEncoder()
nameOrig_header = nameOrig_header.to_numpy().reshape(-1, 1)
enc_nameOrig_header = nameOrig_header_encoder.fit_transform(nameOrig_header)

#nameDest
nameDest_header_encoder = OrdinalEncoder()
nameDest_header = nameDest_header.to_numpy().reshape(-1, 1)
enc_nameDest_header = nameDest_header_encoder.fit_transform(nameDest_header)

In [14]:
df.nameOrig_header = enc_nameOrig_header

df.nameDest_header = enc_nameDest_header

In [15]:
df.head()

Unnamed: 0,step,type,amount,nameOrig_header,nameOrig_body,oldbalanceOrg,newbalanceOrig,nameDest_header,nameDest_body,oldbalanceDest,newbalanceDest
0,1,3.0,9839.64,0.0,1231006815,170136.0,160296.36,1.0,1979787155,0.0,0.0
1,1,3.0,1864.28,0.0,1666544295,21249.0,19384.72,1.0,2044282225,0.0,0.0
2,1,4.0,181.0,0.0,1305486145,181.0,0.0,0.0,553264065,0.0,0.0
3,1,1.0,181.0,0.0,840083671,181.0,0.0,0.0,38997010,21182.0,0.0
4,1,3.0,11668.14,0.0,2048537720,41554.0,29885.86,1.0,1230701703,0.0,0.0


## Finally, let's transform Pandas dataframe to Numpy array and set dtype to float64

In [16]:
ds = df.to_numpy().astype(np.float64)

In [17]:
ds[:5]

array([[1.00000000e+00, 3.00000000e+00, 9.83964000e+03, 0.00000000e+00,
        1.23100682e+09, 1.70136000e+05, 1.60296360e+05, 1.00000000e+00,
        1.97978716e+09, 0.00000000e+00, 0.00000000e+00],
       [1.00000000e+00, 3.00000000e+00, 1.86428000e+03, 0.00000000e+00,
        1.66654430e+09, 2.12490000e+04, 1.93847200e+04, 1.00000000e+00,
        2.04428222e+09, 0.00000000e+00, 0.00000000e+00],
       [1.00000000e+00, 4.00000000e+00, 1.81000000e+02, 0.00000000e+00,
        1.30548614e+09, 1.81000000e+02, 0.00000000e+00, 0.00000000e+00,
        5.53264065e+08, 0.00000000e+00, 0.00000000e+00],
       [1.00000000e+00, 1.00000000e+00, 1.81000000e+02, 0.00000000e+00,
        8.40083671e+08, 1.81000000e+02, 0.00000000e+00, 0.00000000e+00,
        3.89970100e+07, 2.11820000e+04, 0.00000000e+00],
       [1.00000000e+00, 3.00000000e+00, 1.16681400e+04, 0.00000000e+00,
        2.04853772e+09, 4.15540000e+04, 2.98858600e+04, 1.00000000e+00,
        1.23070170e+09, 0.00000000e+00, 0.00000000e+

In [18]:
targets = targets.astype(np.float64)

In [19]:
from sklearn.model_selection import train_test_split

ds_train, ds_test, targets_train, targets_test = train_test_split(ds, targets, test_size=0.3, random_state=0)

# B. Build the model

## Normalizer
- Since values are so numerically distanced, let's create a normalizer layer to our model.

In [24]:
normalizer = tf.keras.layers.Normalization(axis=-1)

#This might take a while (~2-3mins)
normalizer.adapt(ds_train)

print(normalizer.mean.numpy())

[[2.43362061e+02 1.71315467e+00 1.79942734e+05 0.00000000e+00
  1.07343712e+09 8.35506750e+05 8.56755250e+05 3.37934792e-01
  1.07272205e+09 1.10118125e+06 1.22575388e+06]]


## Model

In [25]:
linear_model = tf.keras.Sequential([normalizer,
                                    tf.keras.layers.Dense(16, activation='relu'),
                                    tf.keras.layers.Dense(1)
])

- Let's compile and train our model.

In [26]:
linear_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.1),
    loss=tf.keras.losses.BinaryCrossentropy()
)

linear_model.fit(
    ds_train,
    targets_train,
    epochs=5,
    validation_split=0.2
    )

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x269e2a3ef50>

In [27]:
results = linear_model.evaluate(ds_test, targets_test, batch_size=128)




In [28]:
results

0.019548460841178894

In [30]:
predictions = linear_model.predict(ds_test[:100])

targets_p = targets_test[:100]

