In [50]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score

In [52]:
df = pd.read_csv('D:/Projects/FinanceForgery/Fraud.csv')

df['balance_change_org'] = df['newbalanceOrig'] - df['oldbalanceOrg']
df['balance_change_dest'] = df['newbalanceDest'] - df['oldbalanceDest']
df['transaction_amount'] = df['amount']
#Ran out of memory inially, changing to float32
df = df.astype({'amount': 'float32', 'oldbalanceOrg': 'float32', 'newbalanceOrig': 'float32',
                 'oldbalanceDest': 'float32', 'newbalanceDest': 'float32', 'balance_change_org': 'float32',
                 'balance_change_dest': 'float32', 'transaction_amount': 'float32'})

df = pd.get_dummies(df, columns=['type'], drop_first=True)
df = df.drop(columns=['nameOrig', 'nameDest'])

df.head

<bound method NDFrame.head of          step        amount  oldbalanceOrg  newbalanceOrig  oldbalanceDest  \
0           1  9.839640e+03     170136.000   160296.359375    0.000000e+00   
1           1  1.864280e+03      21249.000    19384.720703    0.000000e+00   
2           1  1.810000e+02        181.000        0.000000    0.000000e+00   
3           1  1.810000e+02        181.000        0.000000    2.118200e+04   
4           1  1.166814e+04      41554.000    29885.859375    0.000000e+00   
...       ...           ...            ...             ...             ...   
6362615   743  3.396821e+05     339682.125        0.000000    0.000000e+00   
6362616   743  6.311410e+06    6311409.500        0.000000    0.000000e+00   
6362617   743  6.311410e+06    6311409.500        0.000000    6.848884e+04   
6362618   743  8.500025e+05     850002.500        0.000000    0.000000e+00   
6362619   743  8.500025e+05     850002.500        0.000000    6.510099e+06   

         newbalanceDest  isFraud 

In [54]:
X = df.drop(columns=['isFraud', 'isFlaggedFraud'])
y = df['isFraud']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [56]:
import xgboost as xgb
model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    scale_pos_weight=10,
    tree_method='hist',
    device='cuda',
    random_state=42
)

In [60]:
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)
y_pred_prob = model.predict_proba(X_test_scaled)[:, 1]


accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
confusion_matrix = confusion_matrix(y_test, y_pred)

In [62]:
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print("Confusion Matrix:\n", confusion_matrix)

roc_auc = roc_auc_score(y_test, y_pred_prob)
print(f"AUC: {roc_auc:.4f}")


Accuracy: 0.9997
Precision: 0.8307
Recall: 0.9359
F1 Score: 0.8802
Confusion Matrix:
 [[1588220     390]
 [    131    1914]]
AUC: 0.9988
