# Fraud Modeling with TensorFlow Neural Network

## Imports & Inits

In [1]:
%load_ext autoreload
%autoreload 2
%config IPCompleter.greedy=True

import sys, os, time, warnings, pdb, pickle, random, math, re, json
warnings.filterwarnings('ignore')
sys.path.insert(0, '../scripts')

from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, Flatten, Concatenate, Dropout, CategoryEncoding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import class_weight
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from sklearn.preprocessing import LabelEncoder
from datetime import datetime

GLOBAL_SEED=42
np.set_printoptions(precision=4)
sns.set_style("darkgrid")
pd.set_option('display.float_format', '{:.2f}'.format)
%matplotlib inline

2024-10-03 11:37:41.977689: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-03 11:37:42.002021: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-03 11:37:42.008554: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-03 11:37:42.025211: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
model_dir = Path('../models/tf')

In [3]:
day_map = {k:v for k,v in zip(range(7), ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'])}

In [4]:
read_cols = ['name_enc', 'gender_enc', 'age_at_trans_norm', 'job_enc', 'cc_num_enc', 'merchant_enc', 'category_enc', 'merch_lat_norm', 'merch_long_norm', 'city_pop_norm', 'lat_norm', 'long_norm', 'hour', 'day_of_week', 'week_of_year', 'month', 'amt_norm', 'is_fraud']

## Load

In [5]:
train_df = pd.read_csv('../data/processed_train.csv', usecols=read_cols)
val_df = pd.read_csv('../data/processed_val.csv', usecols=read_cols)
test_df = pd.read_csv('../data/processed_test.csv', usecols=read_cols)

train_df = train_df[read_cols]
val_df = val_df[read_cols]
test_df = test_df[read_cols]

In [6]:
features = [col for col in train_df.columns if col != 'is_fraud']

X_train, y_train = train_df[features], train_df['is_fraud']
X_val, y_val = val_df[features], val_df['is_fraud']
X_test, y_test = test_df[features], test_df['is_fraud']

## Model Training

In [7]:
input_name = Input(shape=(1,))
input_gender = Input(shape=(1,))
input_age = Input(shape=(1,))
input_job = Input(shape=(1,))
input_cc_num = Input(shape=(1,))
input_merchant = Input(shape=(1,))
input_category = Input(shape=(1,))
input_merch_lat = Input(shape=(1,))
input_merch_long = Input(shape=(1,))
input_city_pop = Input(shape=(1,))
input_lat = Input(shape=(1,))
input_long = Input(shape=(1,))
input_hour = Input(shape=(1,))
input_dow = Input(shape=(1,))
input_woy = Input(shape=(1,))
input_month = Input(shape=(1,))
input_amt = Input(shape=(1,))

In [8]:
embedding_size = 4

embedding_cc_num = Embedding(input_dim=np.max(X_train['cc_num_enc']) + 1, output_dim=embedding_size)(input_cc_num)
flatten_cc_num = Flatten()(embedding_cc_num)

embedding_merchant = Embedding(input_dim=np.max(X_train['merchant_enc']) + 1, output_dim=embedding_size)(input_merchant)
flatten_merchant = Flatten()(embedding_merchant)

embedding_job = Embedding(input_dim=np.max(X_train['job_enc']) + 1, output_dim=embedding_size)(input_job)
flatten_job = Flatten()(embedding_job)

embedding_name = Embedding(input_dim=np.max(X_train['name_enc']) + 1, output_dim=embedding_size)(input_job)
flatten_name = Flatten()(embedding_name)

I0000 00:00:1727969873.479319    2748 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1727969873.528289    2748 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1727969873.528445    2748 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1727969873.534930    2748 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1727969873.535168    2748 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:0

In [9]:
one_hot_category = CategoryEncoding(num_tokens=X_train['category_enc'].nunique(), output_mode="one_hot")(input_category)

In [10]:
# Concatenate all features
concatenated = Concatenate()([
    flatten_name,
    input_gender,
    input_age,
    flatten_job,
    flatten_cc_num,
    flatten_merchant,
    one_hot_category,
    input_merch_lat,
    input_merch_long,
    input_city_pop,
    input_lat,
    input_long,
    input_hour,    
    input_dow,
    input_woy,
    input_month, 
    input_amt,
])

In [11]:
# Hidden layers
dense_1 = Dense(256, activation='relu')(concatenated)
dense_2 = Dense(256, activation='relu')(dense_1)
dropout1 = Dropout(0.5)(dense_1)
dense_3 = Dense(64, activation='relu')(dropout1)
dropout2 = Dropout(0.5)(dense_3)
output = Dense(1, activation='sigmoid')(dropout2)

In [12]:
# Build and compile model
model = Model(inputs=[
    input_name,
    input_gender,
    input_age,
    input_job,
    input_cc_num,
    input_merchant,
    input_category,
    input_merch_lat,
    input_merch_long,
    input_city_pop,
    input_lat,
    input_long,
    input_hour,    
    input_dow,
    input_woy,
    input_month, 
    input_amt,
], outputs=output)

In [13]:
METRICS = [
      metrics.BinaryCrossentropy(name='cross entropy'),  # same as model's loss
      metrics.MeanSquaredError(name='Brier score'),
      metrics.TruePositives(name='tp'),
      metrics.FalsePositives(name='fp'),
      metrics.TrueNegatives(name='tn'),
      metrics.FalseNegatives(name='fn'),
      metrics.BinaryAccuracy(name='accuracy'),
      metrics.Precision(name='precision'),
      metrics.Recall(name='recall'),
      metrics.AUC(name='auc'),
      metrics.AUC(name='prc', curve='PR'), # precision-recall curve
]

In [14]:
model.compile(optimizer=Adam(learning_rate=0.01), loss='binary_crossentropy', metrics=METRICS)
initial_weights = model.get_weights()
weights = class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights = dict(enumerate(weights))
early_stopping = EarlyStopping(monitor='val_auc', patience=15, restore_best_weights=True)

In [15]:
history = model.fit(
    [
        X_train['name_enc'],
        X_train['gender_enc'],
        X_train['age_at_trans_norm'],
        X_train['job_enc'],
        X_train['cc_num_enc'],
        X_train['merchant_enc'],
        X_train['category_enc'],
        X_train['merch_lat_norm'],
        X_train['merch_long_norm'],
        X_train['city_pop_norm'],
        X_train['lat_norm'],
        X_train['long_norm'],
        X_train['hour'], 
        X_train['day_of_week'], 
        X_train['week_of_year'], 
        X_train['month'],
        X_train['amt_norm'],        
    ],
    np.expand_dims(y_train.values, -1),
    validation_data=[
        [
            X_val['name_enc'],
            X_val['gender_enc'],
            X_val['age_at_trans_norm'],
            X_val['job_enc'],
            X_val['cc_num_enc'],
            X_val['merchant_enc'],
            X_val['category_enc'],
            X_val['merch_lat_norm'],
            X_val['merch_long_norm'],
            X_val['city_pop_norm'],
            X_val['lat_norm'],
            X_val['long_norm'],
            X_val['hour'], 
            X_val['day_of_week'], 
            X_val['week_of_year'], 
            X_val['month'],
            X_val['amt_norm']
        ],
        np.expand_dims(y_val.values, -1)
    ],
    epochs=500,
    batch_size=128,
    class_weight=class_weights,
    callbacks=[early_stopping]
)

Epoch 1/500


I0000 00:00:1727969890.154275    2893 service.cc:146] XLA service 0x7fd398004920 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1727969890.154359    2893 service.cc:154]   StreamExecutor device (0): Quadro RTX 5000 with Max-Q Design, Compute Capability 7.5
2024-10-03 11:38:10.690619: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-10-03 11:38:11.545970: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 8907


[1m   9/9841[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2:19[0m 14ms/step - Brier score: 0.1467 - accuracy: 0.8158 - auc: 0.5942 - cross entropy: 0.7819 - fn: 3.7778 - fp: 73.0000 - loss: 10.9498 - prc: 0.0154 - precision: 0.0135 - recall: 0.3685 - tn: 562.2222 - tp: 1.0000    

I0000 00:00:1727969898.681709    2893 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m9841/9841[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m234s[0m 22ms/step - Brier score: 0.2502 - accuracy: 0.4648 - auc: 0.4994 - cross entropy: 0.6918 - fn: 1620.7804 - fp: 331403.0625 - loss: 0.9226 - prc: 0.0055 - precision: 0.0055 - recall: 0.5393 - tn: 295096.8438 - tp: 1831.3080 - val_Brier score: 0.2666 - val_accuracy: 0.0055 - val_auc: 0.5000 - val_cross entropy: 0.7263 - val_fn: 0.0000e+00 - val_fp: 313162.0000 - val_loss: 0.7263 - val_prc: 0.0055 - val_precision: 0.0055 - val_recall: 1.0000 - val_tn: 0.0000e+00 - val_tp: 1745.0000
Epoch 2/500
[1m9841/9841[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m191s[0m 19ms/step - Brier score: 0.2640 - accuracy: 0.5119 - auc: 0.4951 - cross entropy: 0.7669 - fn: 1903.4172 - fp: 284417.0312 - loss: 1.2445 - prc: 0.0054 - precision: 0.0054 - recall: 0.4835 - tn: 342074.5000 - tp: 1557.0326 - val_Brier score: 0.2762 - val_accuracy: 0.0055 - val_auc: 0.5000 - val_cross entropy: 0.7456 - val_fn: 0.0000e+00 - val_fp: 313162.0000 - v

In [16]:
y_pred_binary = (y_pred > 0.5).astype(int)

print(confusion_matrix(y_test, y_pred_binary))
print(classification_report(y_test, y_pred_binary))
print(f"AUC-ROC: {roc_auc_score(y_test, y_pred)}")

NameError: name 'y_pred' is not defined

## Model Testing

In [None]:
model = lgb.Booster(model_file=model_dir/'model.txt')
y_pred = model.predict(X_test)
auc_score = roc_auc_score(y_test, y_pred)
y_pred_score = model.predict(X_test)
y_pred = (y_pred_score > 0.5).astype(int)
auc = roc_auc_score(y_test, y_pred_score)

report = classification_report(y_test, y_pred)

print(f"AUC on test set: {auc:0.3f}")
print("\nClassification Report:")
print(report)