In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, precision_recall_fscore_support

In [2]:
# Read the transaction table from the CSV that sits next to the notebook.
df = pd.read_csv("onlinefraud.csv")
# Per-column count of missing values; as the last expression it is the cell's output.
df.isna().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [3]:
# Show the loaded frame; pandas abbreviates the rendering to the leading and trailing rows.
df

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.00,160296.36,M1979787155,0.00,0.00,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.00,19384.72,M2044282225,0.00,0.00,0,0
2,1,TRANSFER,181.00,C1305486145,181.00,0.00,C553264065,0.00,0.00,1,0
3,1,CASH_OUT,181.00,C840083671,181.00,0.00,C38997010,21182.00,0.00,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.00,29885.86,M1230701703,0.00,0.00,0,0
...,...,...,...,...,...,...,...,...,...,...,...
6362615,743,CASH_OUT,339682.13,C786484425,339682.13,0.00,C776919290,0.00,339682.13,1,0
6362616,743,TRANSFER,6311409.28,C1529008245,6311409.28,0.00,C1881841831,0.00,0.00,1,0
6362617,743,CASH_OUT,6311409.28,C1162922333,6311409.28,0.00,C1365125890,68488.84,6379898.11,1,0
6362618,743,TRANSFER,850002.52,C1685995037,850002.52,0.00,C2080388513,0.00,0.00,1,0


In [4]:

# Target is the fraud label. Features are the remaining columns after dropping
# `step`, `nameOrig`, `nameDest` and the label itself; the categorical `type`
# column is one-hot encoded with its first category omitted.
y = df["isFraud"]
X = pd.get_dummies(
    df.drop(columns=["step", "nameOrig", "nameDest", "isFraud"]),
    columns=["type"],
    drop_first=True,
)

# Hold out 20% of the rows for testing. Stratifying on y keeps the ratio of
# class 0 to class 1 the same in the training and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# The scaler learns each column's mean and standard deviation from the
# training partition only; the test partition is transformed with those same
# statistics so nothing about the test rows leaks into preprocessing.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [5]:
# Random forest configuration:
#   - class_weight='balanced': weight classes inversely to their frequency so
#     the majority class does not dominate the fit
#   - max_depth=10 / n_estimators=20: 20 decision trees, each at most 10 levels deep
#   - n_jobs=1: train on a single CPU core
#   - random_state=42: fixed seed so the fit is repeatable
rf_model = RandomForestClassifier(
    class_weight='balanced',
    max_depth=10,
    n_estimators=20,
    n_jobs=1,
    random_state=42,
)

# Fit on the scaled training partition; fit() is the last expression, so its
# return value is what the cell displays.
rf_model.fit(X_train_scaled, y_train)


0,1,2
,n_estimators,20
,criterion,'gini'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [6]:
# predict_proba yields one column per class; column index 1 is taken as the
# probability of the fraud class (label 1) for every test row.
y_proba = rf_model.predict_proba(X_test_scaled)[:, 1]
# Hard class labels for the same test rows.
y_pred = rf_model.predict(X_test_scaled)

In [10]:
# Confusion matrix: rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)


# Per-class metrics; each array holds one entry for class 0 followed by class 1.
precision, recall, f1, support = precision_recall_fscore_support(y_test, y_pred)
print("\nPrecision Matrix (class 0    class 1):")
per_class_report = (
    ("Precision:", precision),  # TP / (TP + FP)
    ("Recall:", recall),        # TP / (TP + FN)
    ("F1 Score:", f1),          # harmonic mean of precision and recall
    ("Support:", support),      # number of true samples of each class in the test set
)
for metric_name, metric_values in per_class_report:
    print(metric_name, metric_values)


# ROC-AUC is the area under the curve of true-positive rate against
# false-positive rate; it lies between 0 and 1 and higher is better.
auc_score = roc_auc_score(y_test, y_proba)
print(f"\nROC-AUC Score: {auc_score:.3f}")

Confusion Matrix:
 [[1252505   18376]
 [     15    1628]]

Precision Matrix (class 0    class 1):
Precision: [0.99998802 0.08138372]
Recall: [0.98554074 0.99087036]
F1 Score: [0.99271182 0.15041345]
Support: [1270881    1643]

ROC-AUC Score: 0.999


In [13]:
# Rank the input features by the forest's impurity-based importances.
# scikit-learn normalizes feature_importances_ so that the values sum to 1.
importances = rf_model.feature_importances_
feature_names = X.columns

# argsort returns positions in ascending order of importance; reversing the
# result puts the most important feature first.
indices = np.argsort(importances)[::-1]

# Clamp to the number of available features so the loop can never index past
# the end when the design matrix has fewer than five columns.
top_n = min(5, len(indices))

print(f"Top {top_n} Important Features:")
for idx in indices[:top_n]:
    print(f"{feature_names[idx]}: {importances[idx]:.4f}")

Top 5 Important Features:
oldbalanceOrg: 0.3402
newbalanceOrig: 0.1894
amount: 0.1719
type_TRANSFER: 0.1028
type_PAYMENT: 0.0536
