In [8]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

In [9]:

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [10]:
df_tx_bank= pd.read_csv('./../transaction_data.csv')
df_tx_bank = df_tx_bank.head(1000).copy()

In [11]:
df_processed = df_tx_bank.copy()


In [12]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df_processed['Transaction Status'] = le.fit_transform(df_processed['Transaction Status'])
df_processed['Transaction ID'] = le.fit_transform(df_processed['Transaction ID'])
df_processed['Transaction Type'] = le.fit_transform(df_processed['Transaction Type'])
df_processed['Sender Account ID'] = le.fit_transform(df_processed['Sender Account ID'])
df_processed['Receiver Account ID'] = le.fit_transform(df_processed['Receiver Account ID'])
df_processed['Device Used'] = le.fit_transform(df_processed['Device Used'])
df_processed['Network Slice ID'] = le.fit_transform(df_processed['Network Slice ID'])
df_processed['Fraud Flag'] = le.fit_transform(df_processed['Fraud Flag'])

df_processed['Timestamp'] = pd.to_datetime(df_processed['Timestamp'])
df_processed['Timestamp'] = df_processed['Timestamp'].dt.hour + df_processed['Timestamp'].dt.minute / 100



In [13]:
import re
df_reset = df_tx_bank.reset_index(drop=True)

for index, row in df_reset.iterrows():
    nums = re.findall(r'\d+\.?\d*', row['Geolocation (Latitude/Longitude)'])  # ['34.0522', '74.006']

    # Remove decimal points
    lat = nums[0].replace('.', '')  # '340522'
    lon = nums[1].replace('.', '')  # '74006'

    # Combine
    result = int(lat + lon)  # 34052274006
    df_processed.at[index, 'Geolocation (Latitude/Longitude)'] = result
  

In [14]:
df_processed.drop(columns=['PIN Code'], inplace=True)

df_processed

Unnamed: 0,Transaction ID,Sender Account ID,Receiver Account ID,Transaction Amount,Transaction Type,Timestamp,Transaction Status,Fraud Flag,Geolocation (Latitude/Longitude),Device Used,Network Slice ID,Latency (ms),Slice Bandwidth (Mbps)
0,949,57,88,495.90,0,10.14,0,1,34052274006,0,2,10,179
1,940,542,266,529.62,2,10.51,1,0,3568951182437,1,1,11,89
2,398,515,918,862.47,2,10.50,0,0,48856623522,1,0,4,53
3,158,409,745,1129.88,1,10.56,1,1,34052274006,1,2,10,127
4,376,567,17,933.24,0,10.25,1,1,557558376173,1,2,20,191
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,716,760,60,1340.76,1,10.49,1,0,3568951182437,1,1,19,224
996,159,954,469,483.36,2,11.00,0,1,557558376173,1,1,12,56
997,82,953,108,199.81,2,10.54,1,0,3405221396917,0,0,14,151
998,187,724,947,1341.86,1,10.59,1,1,5150741396917,1,2,8,95


In [15]:
#preprocessing

# scaler = StandardScaler()
X = df_processed.drop(columns=['Fraud Flag'], axis=1)

Y= df_processed['Fraud Flag']

xData = X.values 
yData = Y.values 

In [16]:
xTrain, xTest, yTrain, yTest = train_test_split(xData, yData, test_size = 0.2, random_state = 42) 

In [17]:
rfc = RandomForestClassifier(n_estimators=500,class_weight='balanced',random_state=123,max_depth=4) 
rfc.fit(xTrain,yTrain) 

y_pred = rfc.predict(xTest)

In [18]:
from sklearn.metrics import classification_report, accuracy_score  
from sklearn.metrics import precision_score, recall_score 
from sklearn.metrics import f1_score, matthews_corrcoef 
from sklearn.metrics import confusion_matrix 

# n_outliers = len(fraud) 
n_errors = (y_pred != yTest).sum() 
print("The model used is Random Forest classifier") 
  
acc = accuracy_score(yTest, y_pred) 
print("The accuracy is {}".format(acc)) 
  
prec = precision_score(yTest, y_pred) 
print("The precision is {}".format(prec)) 
  
rec = recall_score(yTest, y_pred) 
print("The recall is {}".format(rec)) 
  
f1 = f1_score(yTest, y_pred) 
print("The F1-Score is {}".format(f1)) 
  
MCC = matthews_corrcoef(yTest, y_pred) 
print("The Matthews correlation coefficient is{}".format(MCC)) 

The model used is Random Forest classifier
The accuracy is 0.485
The precision is 0.4392523364485981
The recall is 0.5222222222222223
The F1-Score is 0.47715736040609136
The Matthews correlation coefficient is-0.023172712614886102


In [19]:
# rfc_train_pred = (rfc.predict_proba(X_train)[:,1]>0.5).astype(bool)   # Set threshold to 0.5
# rfc_val_pred = (rfc.predict_proba(X_val)[:,1]>0.5).astype(bool)# Set threshold to 0.5

In [20]:
# from sklearn.metrics import confusion_matrix,accuracy_score,cohen_kappa_score,roc_auc_score,f1_score,roc_curve

# cm0 = confusion_matrix(y_train, rfc_train_pred,labels=[1,0])
# print('Confusion Matrix Train : \n', cm0)

# cm1 = confusion_matrix(y_val, rfc_val_pred,labels=[1,0])
# print('Confusion Matrix Test: \n', cm1)

# total0=sum(sum(cm0))
# total1=sum(sum(cm1))
# #####from confusion matrix calculate accuracy
# accuracy0=(cm0[0,0]+cm0[1,1])/total0
# print ('Accuracy Train : ', accuracy0)

# accuracy1=(cm1[0,0]+cm1[1,1])/total1
# print ('Accuracy Test : ', accuracy1)




# print("F1-Score Train",f1_score(y_train,rfc_train_pred))
# print("F1-Score Validation : ",f1_score(y_val, rfc_val_pred))