In [133]:
import pandas as pd
import numpy as np
import os
import warnings
warnings.filterwarnings("ignore")
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks

from dotenv import load_dotenv
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score


# Load finance data from SQL

In [134]:
env_path = os.path.abspath("../.env")
load_dotenv(dotenv_path=env_path)

DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")
DB_HOST = os.getenv("DB_HOST")
DB_PORT = os.getenv("DB_PORT")
DB_NAME = os.getenv("DB_NAME")

engine = create_engine(
    f"postgresql+psycopg2://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
)

finance = pd.read_sql("SELECT * FROM feature.finance", engine)
marketing = pd.read_sql("SELECT * FROM feature.marketing", engine)
feature_marketing = pd.read_sql("SELECT * FROM feature.marketing", engine)
feature_finance = pd.read_sql("SELECT * FROM feature.finance", engine)
marketing_score = pd.read_sql("SELECT * FROM marketing_scores", engine)
feature_finance_fraud = pd.read_sql("SELECT * FROM feature.finance_fraud_features", engine)
feature_marketing_fraud = pd.read_sql("SELECT * FROM feature.marketing_fraud_features", engine)
feature_fraud = pd.read_sql("SELECT * FROM feature.feature_fraud", engine)
label_fraud = pd.read_sql("SELECT * FROM label.fraud_label", engine)
feature_fraud_dataset = pd.read_sql("SELECT * FROM feature.training_fraud_dataset", engine)
finance_fraud_daily = pd.read_sql("SELECT * FROM feature.finance_fraud_daily", engine)
marketing_fraud_daily = pd.read_sql("SELECT * FROM feature.marketing_fraud_daily", engine)
feature_fraud_daily = pd.read_sql("SELECT * FROM feature.feature_fraud_daily", engine)
fraud_label_daily = pd.read_sql("SELECT * FROM label.fraud_label_daily", engine)
training_fraud_daily = pd.read_sql("SELECT * FROM feature.training_fraud_daily", engine)
daily_fraud_alert = pd.read_sql("SELECT * FROM alert.daily_fraud_alert", engine)
pd.set_option('display.max_columns', None)

In [138]:
# main dataset
df = feature_fraud_dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   user_id              41188 non-null  object 
 1   event_date           41188 non-null  object 
 2   tx_count             41188 non-null  int64  
 3   total_tx_amount      41188 non-null  float64
 4   avg_tx_amount        41188 non-null  float64
 5   max_tx_amount        41188 non-null  float64
 6   std_tx_amount        41188 non-null  float64
 7   avg_account_balance  41188 non-null  float64
 8   total_clicks         41188 non-null  int64  
 9   total_impressions    41188 non-null  int64  
 10  total_conversion     41188 non-null  int64  
 11  ctr                  41188 non-null  float64
 12  weekday              41188 non-null  float64
 13  month                41188 non-null  float64
 14  year                 41188 non-null  float64
 15  fraud_label          41188 non-null 

In [136]:
feature_fraud_dataset

Unnamed: 0,user_id,event_date,tx_count,total_tx_amount,avg_tx_amount,max_tx_amount,std_tx_amount,avg_account_balance,total_clicks,total_impressions,total_conversion,ctr,weekday,month,year,fraud_label
0,AA13249,2025-12-25,464,234972720.0,506406.724138,4963146.0,1.040360e+06,575492.148707,222,1,0,222.000000,4.0,12.0,2025.0,1
1,AA16259,2025-12-25,464,234972720.0,506406.724138,4963146.0,1.040360e+06,575492.148707,237,1,0,237.000000,4.0,12.0,2025.0,1
2,AA35121,2025-12-25,464,234972720.0,506406.724138,4963146.0,1.040360e+06,575492.148707,133,1,0,133.000000,4.0,12.0,2025.0,1
3,AA23441,2025-12-25,464,234972720.0,506406.724138,4963146.0,1.040360e+06,575492.148707,167,7,0,23.857143,4.0,12.0,2025.0,1
4,AA19754,2025-12-25,464,234972720.0,506406.724138,4963146.0,1.040360e+06,575492.148707,243,1,0,243.000000,4.0,12.0,2025.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,AA8156,2025-12-25,464,234972720.0,506406.724138,4963146.0,1.040360e+06,575492.148707,73,1,0,73.000000,4.0,12.0,2025.0,1
41184,AA22986,2025-12-25,464,234972720.0,506406.724138,4963146.0,1.040360e+06,575492.148707,153,5,0,30.600000,4.0,12.0,2025.0,1
41185,AA19073,2025-12-25,464,234972720.0,506406.724138,4963146.0,1.040360e+06,575492.148707,196,1,0,196.000000,4.0,12.0,2025.0,1
41186,AA37743,2025-12-25,464,234972720.0,506406.724138,4963146.0,1.040360e+06,575492.148707,64,1,0,64.000000,4.0,12.0,2025.0,1


In [139]:
# Feature engineering for datetime
df['event_date'] = pd.to_datetime(df['event_date'])
df['Day'] = df['event_date'].dt.day

In [141]:
df.isnull().sum()

user_id                0
event_date             0
tx_count               0
total_tx_amount        0
avg_tx_amount          0
max_tx_amount          0
std_tx_amount          0
avg_account_balance    0
total_clicks           0
total_impressions      0
total_conversion       0
ctr                    0
weekday                0
month                  0
year                   0
fraud_label            0
Day                    0
dtype: int64

# Feature Target

In [None]:
TARGET = "fraud_label"

X = df.drop(columns=[TARGET, "fraud_reason"])
y = df[TARGET]

# Target split data

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scalling data

In [None]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Modeling

In [None]:
model = models.Sequential([
    layers.Input(shape=(X_train_scaled.shape[1],)),
    layers.Dense(64, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[
        tf.keras.metrics.AUC(name='auc'),
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall')
    ]
)

model.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_24 (Dense)            (None, 64)                1280      
                                                                 
 dense_25 (Dense)            (None, 32)                2080      
                                                                 
 dense_26 (Dense)            (None, 1)                 33        
                                                                 
Total params: 3393 (13.25 KB)
Trainable params: 3393 (13.25 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_24 (Dense)            (None, 64)                1280      
                                                                 
 dense_25 (Dense)   

# Training model

In [None]:
history = model.fit(
    X_train_scaled,
    y_train,
    validation_data=(X_test_scaled, y_test),
    epochs=50,
    batch_size=32,
    verbose=1
)

Epoch 1/50
Epoch 2/50
Epoch 2/50
Epoch 3/50
Epoch 3/50
Epoch 4/50
Epoch 4/50
Epoch 5/50
Epoch 5/50
Epoch 6/50
Epoch 6/50
Epoch 7/50
Epoch 7/50
Epoch 8/50
Epoch 8/50
Epoch 9/50
Epoch 9/50
Epoch 10/50
Epoch 10/50
Epoch 11/50
Epoch 11/50
Epoch 12/50
Epoch 12/50
Epoch 13/50
Epoch 13/50
Epoch 14/50
Epoch 14/50
Epoch 15/50
Epoch 15/50
Epoch 16/50
Epoch 16/50
Epoch 17/50
Epoch 17/50
Epoch 18/50
Epoch 18/50
Epoch 19/50
Epoch 19/50
Epoch 20/50
Epoch 20/50
Epoch 21/50
Epoch 21/50
Epoch 22/50
Epoch 22/50
Epoch 23/50
Epoch 23/50
Epoch 24/50
Epoch 24/50
Epoch 25/50
Epoch 25/50
Epoch 26/50
Epoch 26/50
Epoch 27/50
Epoch 27/50
Epoch 28/50
Epoch 28/50
Epoch 29/50
Epoch 29/50
Epoch 30/50
Epoch 30/50
Epoch 31/50
Epoch 31/50
Epoch 32/50
Epoch 32/50
Epoch 33/50
Epoch 33/50
Epoch 34/50
Epoch 34/50
Epoch 35/50
Epoch 35/50
Epoch 36/50
Epoch 36/50
Epoch 37/50
Epoch 37/50
Epoch 38/50
Epoch 38/50
Epoch 39/50
Epoch 39/50
Epoch 40/50
Epoch 40/50
Epoch 41/50
Epoch 41/50
Epoch 42/50
Epoch 42/50
Epoch 43/50
Epoch 43/

# Evaluation

In [None]:
y_pred_prob = model.predict(X_test_scaled).ravel()
y_pred = (y_pred_prob >= 0.5).astype(int)

print(classification_report(y_test, y_pred))

# ROC AUC only can be calculated using predicted probabilities
if len(np.unique(y_test)) >1:
    print("ROC AUC Score:", roc_auc_score(y_test, y_pred_prob))
else:
    print("ROC AUC Score: Cannot be calculated, only one class present in y_test.")
    print(f"Test set class distribution: {np.unique(y_test, return_counts=True)}")


              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1

    accuracy                           1.00         1
   macro avg       1.00      1.00      1.00         1
weighted avg       1.00      1.00      1.00         1

ROC AUC Score: Cannot be calculated, only one class present in y_test.
Test set class distribution: (array([0]), array([1]))
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1

    accuracy                           1.00         1
   macro avg       1.00      1.00      1.00         1
weighted avg       1.00      1.00      1.00         1

ROC AUC Score: Cannot be calculated, only one class present in y_test.
Test set class distribution: (array([0]), array([1]))
