# Online Payments Fraud Detection

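An industry-grade rewrite of the fraud detection notebook: it loads the transaction log, encodes and transforms the features, trains Decision Tree and Random Forest classifiers, evaluates them on a held-out test set, and saves the fitted models.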

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
import os
import pickle

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

In [4]:
# Location of the transaction log CSV. The FRAUD_DATA_CSV environment variable
# overrides the default so the notebook can run on machines where the file
# does not live at the original author's absolute Windows path.
DATA_PATH = os.environ.get(
    'FRAUD_DATA_CSV',
    r'E:\IT\aimlsmartBridge\online_payments_fraud_detection\data\PS_20174392719_1491204439457_log.csv',
)

# Load the raw transactions and preview the first rows.
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [5]:
# Remove the account identifiers and the isFlaggedFraud column so they do not
# end up in the feature matrix built from the remaining columns.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second run no longer raises KeyError because the
# columns are already gone.
df = df.drop(columns=['nameOrig', 'nameDest', 'isFlaggedFraud'], errors='ignore')

In [6]:
# Encode the categorical transaction type as integer codes.
# The guard makes the cell safe to re-run: without it, a second run would fit
# a fresh encoder on the already-encoded integers, replacing the string labels
# in le.classes_ with integer codes. That would break later calls such as
# le.transform(["TRANSFER"]) and corrupt the encoder that gets pickled.
if not pd.api.types.is_numeric_dtype(df['type']):
    le = LabelEncoder()
    df['type'] = le.fit_transform(df['type'])

In [7]:
# Log-transform the transaction amount. np.log1p(x) computes log(1 + x)
# directly and stays accurate for amounts close to zero, where forming
# (x + 1) first loses precision.
# NOTE: this cell is not idempotent - running it twice applies the transform
# twice. Re-run from the data-loading cell before executing it again.
df['amount'] = np.log1p(df['amount'])

In [8]:
# Separate the label column from the predictors: y is the isFraud column,
# X is every other column that remains in df.
target_col = 'isFraud'
y = df[target_col]
X = df.drop(columns=[target_col])

In [8]:
# Display the feature matrix (pandas renders a truncated head/tail view).
# NOTE(review): the saved output for this cell shows raw amounts
# (e.g. 9839.64) rather than log values, so it looks like it was captured
# before the log-transform cell ran - re-run the notebook top to bottom to confirm.
X

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest
0,1,3,9839.64,170136.00,160296.36,0.00,0.00
1,1,3,1864.28,21249.00,19384.72,0.00,0.00
2,1,4,181.00,181.00,0.00,0.00,0.00
3,1,1,181.00,181.00,0.00,21182.00,0.00
4,1,3,11668.14,41554.00,29885.86,0.00,0.00
...,...,...,...,...,...,...,...
6362615,743,1,339682.13,339682.13,0.00,0.00,339682.13
6362616,743,4,6311409.28,6311409.28,0.00,0.00,0.00
6362617,743,1,6311409.28,6311409.28,0.00,68488.84,6379898.11
6362618,743,4,850002.52,850002.52,0.00,0.00,0.00


In [9]:
# Display the isFraud target series (pandas renders a truncated head/tail view
# followed by the series name, length and dtype).
y

0          0
1          0
2          1
3          1
4          0
          ..
6362615    1
6362616    1
6362617    1
6362618    1
6362619    1
Name: isFraud, Length: 6362620, dtype: int64

In [9]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# reproducible.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [11]:
# Depth-limited decision tree; class_weight='balanced' re-weights the classes
# to compensate for the imbalance between fraud and non-fraud rows.
tree_params = dict(max_depth=6, class_weight='balanced', random_state=42)
model = DecisionTreeClassifier(**tree_params)

# fit() returns the estimator, so the cell output is the fitted model's repr.
model.fit(X_train, y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,6
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [16]:
# Score the held-out set with the decision tree.
# Column 1 of predict_proba is the probability of the second entry in
# model.classes_, which is printed further down as [0 1].
class_probs = model.predict_proba(X_test)
y_prob = class_probs[:, 1]

# Hard class labels for the same rows.
y_pred = model.predict(X_test)

In [17]:
# Confusion matrix (rows: true class, columns: predicted class) followed by
# the per-class precision / recall / F1 report for the decision tree.
tree_cm = confusion_matrix(y_test, y_pred)
print(tree_cm)

tree_report = classification_report(y_test, y_pred)
print(tree_report)

[[1255622   15259]
 [     66    1577]]
              precision    recall  f1-score   support

           0       1.00      0.99      0.99   1270881
           1       0.09      0.96      0.17      1643

    accuracy                           0.99   1272524
   macro avg       0.55      0.97      0.58   1272524
weighted avg       1.00      0.99      0.99   1272524



In [18]:
# Area under the ROC curve, computed from the class-1 probabilities in y_prob
# rather than from the hard labels.
tree_auc = roc_auc_score(y_test, y_prob)
print('ROC-AUC:', tree_auc)

ROC-AUC: 0.994214560136226


In [22]:
# Show the transaction-type labels the encoder learned; a label's position in
# this array is the integer code it was mapped to.
print("Transaction types learned by encoder:", le.classes_, sep="\n")

Transaction types learned by encoder:
['CASH_IN' 'CASH_OUT' 'DEBIT' 'PAYMENT' 'TRANSFER']


In [23]:
# Persist the fitted decision tree and the transaction-type encoder.
# The with-blocks guarantee each file is flushed and closed even if
# pickling fails; the bare open(...) calls left the handles unclosed.
with open('fraud_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

with open('type_encoder.pkl', 'wb') as encoder_file:
    pickle.dump(le, encoder_file)

In [24]:
# Smoke-test the decision tree on one hand-built transaction. The row is
# preprocessed the same way as the training frame: the type string goes
# through the fitted encoder and the amount is log-transformed.
transfer_code = le.transform(["TRANSFER"])[0]
log_amount = np.log(9000 + 1)

# Keys follow the order of the columns the model was trained on.
sample_row = {
    "step": 45,
    "type": transfer_code,
    "amount": log_amount,
    "oldbalanceOrg": 10000,
    "newbalanceOrig": 1000,
    "oldbalanceDest": 0,
    "newbalanceDest": 9000,
}
sample = pd.DataFrame([sample_row])

predicted_label = model.predict(sample)
class_probabilities = model.predict_proba(sample)

print("Prediction:", predicted_label)
# One probability per class, in the order of model.classes_.
print("Fraud Probability:", class_probabilities)

Prediction: [0]
Fraud Probability: [[1. 0.]]


In [25]:
# Column names (and order) the decision tree recorded when it was fitted;
# inference inputs must use the same columns.
trained_feature_names = model.feature_names_in_
print(trained_feature_names)

['step' 'type' 'amount' 'oldbalanceOrg' 'newbalanceOrig' 'oldbalanceDest'
 'newbalanceDest']


In [26]:
# Class labels known to the tree; predict_proba columns follow this order.
tree_classes = model.classes_
print("Model classes:", tree_classes)

Model classes: [0 1]


In [11]:
# Random forest trained on the same split as the decision tree.
# RandomForestClassifier is imported in the notebook's import cell rather
# than here, so all dependencies are declared in one place.
rfc = RandomForestClassifier(
    n_estimators=100,
    max_depth=8,
    class_weight="balanced",  # re-weight classes to offset the fraud / non-fraud imbalance
    n_jobs=1,                 # single worker process
    random_state=42,
)

# fit() returns the estimator, so the cell output is the fitted model's repr.
rfc.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,8
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [12]:
# Evaluate the random forest on the held-out set.
# NOTE: this rebinds y_prob and y_pred, which previously held the decision
# tree's outputs; re-running an earlier metrics cell after this one would
# report random-forest numbers.
rfc_scores = rfc.predict_proba(X_test)
y_prob = rfc_scores[:, 1]

# Explicit 0.5 cut-off on the class-1 probability.
y_pred = (y_prob >= 0.5).astype(int)

rfc_cm = confusion_matrix(y_test, y_pred)
print(rfc_cm)

rfc_report = classification_report(y_test, y_pred)
print(rfc_report)

[[1248745   22136]
 [     17    1626]]
              precision    recall  f1-score   support

           0       1.00      0.98      0.99   1270881
           1       0.07      0.99      0.13      1643

    accuracy                           0.98   1272524
   macro avg       0.53      0.99      0.56   1272524
weighted avg       1.00      0.98      0.99   1272524



In [13]:
# Persist the fitted random forest. pickle is already imported in the
# notebook's import cell, so it is not re-imported here. The with-block
# guarantees the file is flushed and closed even if pickling fails.
with open("fraud_rfc.pkl", "wb") as rfc_file:
    pickle.dump(rfc, rfc_file)