In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [10]:
# Read the labelled transaction CSV into a DataFrame and preview the first rows.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact location exists.
DATA_PATH = r"C:\Users\gagan\Downloads\Data\Winter 23-24\Capstone\fraud-transactions-detection\JPMorgan\SupervisedFraudData.csv"
data = pd.read_csv(DATA_PATH)
data.head(3)

Unnamed: 0,Transaction_Id,Sender_Id,Sender_Account,Sender_Country,Sender_Sector,Sender_lob,Bene_Id,Bene_Account,Bene_Country,USD_amount,Label,Transaction_Type,Date,Time
0,EXCHANGE-10115,JPMC-CLIENT-10098,ACCOUNT-10108,USA,35537,CCB,Unknown,Unknown,Unknown,558.43,0,WITHDRAWAL,2022-01-15,37440
1,QUICK-PAYMENT-10116,JPMC-CLIENT-10098,ACCOUNT-10109,USA,15287,CCB,CLIENT-10100,ACCOUNT-10106,CANADA,622.78,0,QUICK-PAYMENT,2022-01-15,37440
2,DEPOSIT-CASH-9833,Unknown,Unknown,Unknown,0,Unknown,JPMC-CLIENT-9812,ACCOUNT-9826,USA,802.54,0,DEPOSIT-CASH,2022-01-15,37440


In [11]:
# Show the column labels of the loaded frame
data.columns

Index(['Transaction_Id', 'Sender_Id', 'Sender_Account', 'Sender_Country',
       'Sender_Sector', 'Sender_lob', 'Bene_Id', 'Bene_Account',
       'Bene_Country', 'USD_amount', 'Label', 'Transaction_Type', 'Date',
       'Time'],
      dtype='object')

In [12]:
# Print per-column dtype, non-null count and overall memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1498177 entries, 0 to 1498176
Data columns (total 14 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   Transaction_Id    1498177 non-null  object 
 1   Sender_Id         1498177 non-null  object 
 2   Sender_Account    1498177 non-null  object 
 3   Sender_Country    1498177 non-null  object 
 4   Sender_Sector     1498177 non-null  int64  
 5   Sender_lob        1498177 non-null  object 
 6   Bene_Id           1498177 non-null  object 
 7   Bene_Account      1498177 non-null  object 
 8   Bene_Country      1498177 non-null  object 
 9   USD_amount        1498177 non-null  float64
 10  Label             1498177 non-null  int64  
 11  Transaction_Type  1498177 non-null  object 
 12  Date              1498177 non-null  object 
 13  Time              1498177 non-null  int64  
dtypes: float64(1), int64(3), object(10)
memory usage: 160.0+ MB


# Data preprocessing

In [13]:
from datetime import datetime, timezone

def convert_to_unix(date_string):
    """Convert a ``YYYY-MM-DD`` date string to a Unix timestamp in seconds.

    The date is interpreted as midnight UTC. Calling ``timestamp()`` on a
    naive datetime would use the local timezone of whichever machine runs the
    notebook, making the feature values machine-dependent and shifting them
    by an hour across daylight-saving transitions.

    Parameters
    ----------
    date_string : str
        Calendar date formatted as ``%Y-%m-%d``.

    Returns
    -------
    int
        Whole seconds between the Unix epoch and midnight UTC of that date.

    Raises
    ------
    ValueError
        If ``date_string`` does not match ``%Y-%m-%d``.
    """
    parsed = datetime.strptime(date_string, "%Y-%m-%d")
    # Attach UTC explicitly so the result does not depend on the host timezone
    return int(parsed.replace(tzinfo=timezone.utc).timestamp())

# Derive a numeric 'Unix_Timestamp' column by converting every 'Date' string
date_strings = data['Date']
data['Unix_Timestamp'] = date_strings.map(convert_to_unix)

In [14]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the two time-related columns with a min-max scaler.
# NOTE(review): the scaler is fitted on the whole frame, before the train/test
# split performed later in the notebook, so test rows influence the scaling.
time_cols = ['Unix_Timestamp', 'Time']
scaler = MinMaxScaler()
data[time_cols] = scaler.fit_transform(data[time_cols])

In [15]:
# Replace each categorical column with integer codes, keeping one fitted
# LabelEncoder per column in `label_encoders` so codes can be mapped back.
categorical_cols = ['Sender_Country', 'Sender_lob', 'Bene_Country', 'Transaction_Type']
label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col, encoder in label_encoders.items():
    data[col] = encoder.fit_transform(data[col])

In [16]:
# Standardise 'USD_amount' with a StandardScaler.
# Note: this rebinds `scaler`, replacing the MinMaxScaler created earlier.
scaler = StandardScaler()
usd_amount = data[['USD_amount']].to_numpy()  # 2-D (n_rows, 1) array, as the scaler expects
data['USD_amount'] = scaler.fit_transform(usd_amount)

In [17]:
# Build the feature matrix X and target vector y.
# Dropped: the target itself, the ID/account columns, 'Bene_Country' and the
# raw 'Date' string (its numeric form lives in 'Unix_Timestamp').
non_feature_cols = [
    'Label',
    'Transaction_Id',
    'Sender_Id',
    'Sender_Account',
    'Bene_Id',
    'Bene_Account',
    'Bene_Country',
    'Date',
]
X = data.drop(columns=non_feature_cols)
y = data['Label']

In [18]:
# Hold out 20% of the rows for testing; stratify on y so both partitions keep
# the same class proportions, and fix the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Model training and evaluation

In [19]:
# Multi-Layer Perceptron (MLP): two hidden layers (64 and 32 units), seeded.
mlp = MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=100, random_state=42)
mlp_pred = mlp.fit(X_train, y_train).predict(X_test)  # fit() returns the estimator
print("MLP Classifier:", classification_report(y_test, mlp_pred), sep="\n")

MLP Classifier:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99    293472
           1       0.83      0.01      0.02      6164

    accuracy                           0.98    299636
   macro avg       0.90      0.51      0.50    299636
weighted avg       0.98      0.98      0.97    299636



In [20]:
# AdaBoost: 100 boosting rounds, seeded.
adaboost = AdaBoostClassifier(n_estimators=100, random_state=42)
adaboost_pred = adaboost.fit(X_train, y_train).predict(X_test)  # fit() returns the estimator
print("AdaBoost Classifier:", classification_report(y_test, adaboost_pred), sep="\n")



AdaBoost Classifier:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99    293472
           1       0.94      0.03      0.06      6164

    accuracy                           0.98    299636
   macro avg       0.96      0.52      0.53    299636
weighted avg       0.98      0.98      0.97    299636



In [21]:
# Naive Bayes: Gaussian NB with default settings.
# NOTE(review): the report recorded under this cell matches the AdaBoost
# report digit for digit; re-run to confirm the saved output is not stale.
naive_bayes = GaussianNB()
nb_pred = naive_bayes.fit(X_train, y_train).predict(X_test)  # fit() returns the estimator
print("Naive Bayes Classifier:", classification_report(y_test, nb_pred), sep="\n")

Naive Bayes Classifier:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99    293472
           1       0.94      0.03      0.06      6164

    accuracy                           0.98    299636
   macro avg       0.96      0.52      0.53    299636
weighted avg       0.98      0.98      0.97    299636



In [22]:
# Neural Network with TensorFlow
# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation and
# batch shuffling are repeatable, in line with the random_state=42 used by the
# scikit-learn models in this notebook.
tf.keras.utils.set_random_seed(42)

# Declare the input shape with an explicit Input object instead of passing
# `input_shape` to the first Dense layer; Keras warns against the latter for
# Sequential models. The architecture itself is unchanged: 64 -> 32 -> 1.
model = Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),  # single sigmoid unit for the binary label
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# 20% of the training rows are held back by Keras for per-epoch validation.
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m29964/29964[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 639us/step - accuracy: 0.9642 - loss: 6.4172 - val_accuracy: 0.9793 - val_loss: 0.1066
Epoch 2/10
[1m29964/29964[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 960us/step - accuracy: 0.9793 - loss: 0.1213 - val_accuracy: 0.9794 - val_loss: 0.1005
Epoch 3/10
[1m29964/29964[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 653us/step - accuracy: 0.9798 - loss: 0.0987 - val_accuracy: 0.9794 - val_loss: 0.1003
Epoch 4/10
[1m29964/29964[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 650us/step - accuracy: 0.9796 - loss: 0.0996 - val_accuracy: 0.9793 - val_loss: 0.1004
Epoch 5/10
[1m29964/29964[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 716us/step - accuracy: 0.9796 - loss: 0.0996 - val_accuracy: 0.9794 - val_loss: 0.1005
Epoch 6/10
[1m29964/29964[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 691us/step - accuracy: 0.9798 - loss: 0.0988 - val_accuracy: 0.9794 - val_loss: 0.10

In [23]:
# Evaluate Neural Network: threshold the sigmoid outputs at 0.5 to get 0/1 labels
nn_scores = model.predict(X_test)
nn_pred = (nn_scores > 0.5).astype("int32")
print("Neural Network:", classification_report(y_test, nn_pred), sep="\n")

[1m9364/9364[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 877us/step
Neural Network:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99    293472
           1       1.00      0.01      0.01      6164

    accuracy                           0.98    299636
   macro avg       0.99      0.50      0.50    299636
weighted avg       0.98      0.98      0.97    299636

