In [1]:
%load_ext autoreload
%autoreload 2
%config IPCompleter.greedy=True

import sys, os, time, warnings, pdb, pickle, random, math, re, json
warnings.filterwarnings('ignore')
sys.path.insert(0, '../scripts')

from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import class_weight
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, Flatten, Concatenate, Dropout, CategoryEncoding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import metrics
import tensorflow as tf
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from sklearn.preprocessing import LabelEncoder

np.set_printoptions(precision=4)
sns.set_style("darkgrid")
pd.set_option('display.float_format', '{:.2f}'.format)
%matplotlib inline

2024-09-04 15:19:21.495423: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-04 15:19:21.662531: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-04 15:19:21.662594: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-04 15:19:21.861050: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
df = pd.read_csv('PS_20174392719_1491204439457_log.csv')

In [3]:
df['diff_orig'] = df['oldbalanceOrg'] - df['newbalanceOrig']
df['diff_dest'] = df['oldbalanceDest'] - df['newbalanceDest']
df['amount_percentage'] = df['amount'] / (df['oldbalanceOrg'] + 1e-9)
df['step'] = df['step'] % 24

# Initialize LabelEncoder for nameOrig, nameDest, and type
label_encoder_orig = LabelEncoder()
label_encoder_dest = LabelEncoder()
label_encoder_type = LabelEncoder()

# Fit the encoder and transform the nameOrig and nameDest columns
df['nameOrig'] = label_encoder_orig.fit_transform(df['nameOrig'])
df['nameDest'] = label_encoder_dest.fit_transform(df['nameDest'])
df['type'] = label_encoder_type.fit_transform(df['type'])

# Check the result
print(df[['nameOrig', 'nameDest']].head())

# Train-test split
y = df['isFraud']
X = df.drop(['isFraud', 'isFlaggedFraud'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Normalize numerical features
scaler = StandardScaler()
X_train[['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest', 'diff_orig', 'diff_dest', 'amount_percentage']] = scaler.fit_transform(
    X_train[['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest', 'diff_orig', 'diff_dest', 'amount_percentage']])
X_test[['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest', 'diff_orig', 'diff_dest', 'amount_percentage']] = scaler.transform(
    X_test[['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest', 'diff_orig', 'diff_dest', 'amount_percentage']])

   nameOrig  nameDest
0    757869   1662094
1   2188998   1733924
2   1002156    439685
3   5828262    391696
4   3445981    828919


In [4]:
# Model with embedding layers for nameOrig and nameDest
input_step = Input(shape=(1,))
input_amount = Input(shape=(1,))
input_oldbalanceOrg = Input(shape=(1,))
input_newbalanceOrig = Input(shape=(1,))
input_oldbalanceDest = Input(shape=(1,))
input_newbalanceDest = Input(shape=(1,))
input_diff_orig = Input(shape=(1,))
input_diff_dest = Input(shape=(1,))
input_amount_percentage = Input(shape=(1,))
input_type = Input(shape=(1,))
input_nameOrig = Input(shape=(1,))
input_nameDest = Input(shape=(1,))

# Embedding layers for nameOrig and nameDest
embedding_size = 16

embedding_nameOrig = Embedding(input_dim=np.max(X_train['nameOrig']) + 1, output_dim=embedding_size)(input_nameOrig)
embedding_nameDest = Embedding(input_dim=np.max(X_train['nameDest']) + 1, output_dim=embedding_size)(input_nameDest)

one_hot_type = CategoryEncoding(num_tokens=5, output_mode="one_hot")(input_type)

# Flatten embedding layers
flatten_nameOrig = Flatten()(embedding_nameOrig)
flatten_nameDest = Flatten()(embedding_nameDest)

# Concatenate all features
concatenated = Concatenate()([
    input_step,
    input_amount, 
    input_oldbalanceOrg, 
    input_newbalanceOrig, 
    input_oldbalanceDest, 
    input_newbalanceDest, 
    input_diff_orig, 
    input_diff_dest, 
    input_amount_percentage, 
    one_hot_type, 
    flatten_nameOrig, 
    flatten_nameDest
])

# Hidden layers
dense_1 = Dense(256, activation='relu')(concatenated)
dense_2 = Dense(256, activation='relu')(dense_1)
dropout1 = Dropout(0.5)(dense_1)
dense_3 = Dense(64, activation='relu')(dropout1)
dropout2 = Dropout(0.5)(dense_3)
output = Dense(1, activation='sigmoid')(dropout2)

# Build and compile model
model = Model(inputs=[
    input_step,
    input_amount, 
    input_oldbalanceOrg, 
    input_newbalanceOrig, 
    input_oldbalanceDest, 
    input_newbalanceDest, 
    input_diff_orig, 
    input_diff_dest, 
    input_amount_percentage, 
    input_type, 
    input_nameOrig, 
    input_nameDest
], outputs=output)

METRICS = [
      metrics.BinaryCrossentropy(name='cross entropy'),  # same as model's loss
      metrics.MeanSquaredError(name='Brier score'),
      metrics.TruePositives(name='tp'),
      metrics.FalsePositives(name='fp'),
      metrics.TrueNegatives(name='tn'),
      metrics.FalseNegatives(name='fn'),
      metrics.BinaryAccuracy(name='accuracy'),
      metrics.Precision(name='precision'),
      metrics.Recall(name='recall'),
      metrics.AUC(name='auc'),
      metrics.AUC(name='prc', curve='PR'), # precision-recall curve
]

model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=METRICS)

# Class weights to handle imbalance
weights = class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights = dict(enumerate(weights))

# Train the model
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

history = model.fit(
    [X_train['step'],
     X_train['amount'], 
     X_train['oldbalanceOrg'], 
     X_train['newbalanceOrig'], 
     X_train['oldbalanceDest'], 
     X_train['newbalanceDest'], 
     X_train['diff_orig'], 
     X_train['diff_dest'], 
     X_train['amount_percentage'], 
     X_train['type'], 
     X_train['nameOrig'], 
     X_train['nameDest']], 
    np.expand_dims(y_train.values, -1), 
    validation_split=0.2, 
    epochs=50, 
    batch_size=8192*4, 
    class_weight=class_weights, 
    callbacks=[early_stopping]
)

# Evaluate the model
y_pred = model.predict([
    X_test['step'],
    X_test['amount'], 
    X_test['oldbalanceOrg'], 
    X_test['newbalanceOrig'], 
    X_test['oldbalanceDest'], 
    X_test['newbalanceDest'], 
    X_test['diff_orig'], 
    X_test['diff_dest'], 
    X_test['amount_percentage'], 
    X_test['type'], 
    X_test['nameOrig'], 
    X_test['nameDest']
])

y_pred_binary = (y_pred > 0.5).astype(int)

print(confusion_matrix(y_test, y_pred_binary))
print(classification_report(y_test, y_pred_binary))
print(f"AUC-ROC: {roc_auc_score(y_test, y_pred)}")

2024-09-04 15:20:44.670899: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-09-04 15:20:44.671504: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-09-04 15:20:44.672037: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

<class 'pandas.core.series.Series'>
Index: 5090096 entries, 292779 to 1541412
Series name: isFraud
Non-Null Count    Dtype
--------------    -----
5090096 non-null  int64
dtypes: int64(1)
memory usage: 77.7 MB
None
Epoch 1/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 569ms/step - Brier score: 0.1551 - accuracy: 0.7866 - auc: 0.8333 - cross entropy: 0.4984 - fn: 615.3571 - fp: 355571.7500 - loss: 0.5418 - prc: 0.0786 - precision: 0.0052 - recall: 0.7440 - tn: 1721819.5000 - tp: 2121.5635 - val_Brier score: 0.0357 - val_accuracy: 0.9651 - val_auc: 0.9816 - val_cross entropy: 0.1454 - val_fn: 143.0000 - val_fp: 35357.0000 - val_loss: 0.1454 - val_prc: 0.4031 - val_precision: 0.0312 - val_recall: 0.8885 - val_tn: 981381.0000 - val_tp: 1139.0000
Epoch 2/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 535ms/step - Brier score: 0.0544 - accuracy: 0.9309 - auc: 0.9751 - cross entropy: 0.1914 - fn: 225.0556 - fp: 139267.4375 - loss: 0.2033 - prc: 0