<a href="https://colab.research.google.com/github/Joothis/Fraud-Detection-Using-GANs-and-Deep-Learning/blob/main/Fraud_Detection_Using_Generative_Adversarial_Networks_(GANs)_and_Deep_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torch torchvision torchaudio transformers scikit-learn pandas numpy matplotlib seaborn optuna shap

In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, roc_auc_score

**Step 2: Load & Preprocess Data**







In [4]:
from google.colab import drive
drive.mount('/content/drive')
df = pd.read_csv('/content/drive/MyDrive/Project/Fraud_Deduction_Dataset.csv')


Mounted at /content/drive


In [5]:
df.head()
df.info()
df.describe()
df.isnull().sum()  # Check missing values


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


Unnamed: 0,0
step,0
type,0
amount,0
nameOrig,0
oldbalanceOrg,0
newbalanceOrig,0
nameDest,0
oldbalanceDest,0
newbalanceDest,0
isFraud,0


In [6]:
for col in df.select_dtypes(include=['object']).columns:
    df[col] = LabelEncoder().fit_transform(df[col])


In [7]:
print(df.columns)


Index(['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
       'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud',
       'isFlaggedFraud'],
      dtype='object')


In [8]:
df.rename(columns={'isFraud': 'fraud_label'}, inplace=True)

scaler = StandardScaler()
scaled_features = scaler.fit_transform(df.drop(columns=['fraud_label']))
df_scaled = pd.DataFrame(scaled_features, columns=df.columns[:-1])
df_scaled['fraud_label'] = df['fraud_label']


Step 3: Address Imbalance using **GANs**

In [9]:
fraud_df = df_scaled[df_scaled['fraud_label'] == 1]
non_fraud_df = df_scaled[df_scaled['fraud_label'] == 0]

X_train, X_test, y_train, y_test = train_test_split(df_scaled.drop(columns=['fraud_label']), df_scaled['fraud_label'], test_size=0.2, random_state=42)

In [10]:
class Generator(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(Generator, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Linear(256, output_dim),  # Ensure output_dim matches dataset features
            nn.Sigmoid()  # Keeps values in range
        )

    def forward(self, z):
        return self.model(z)

# Fix latent_dim and output_dim
latent_dim = 10  # Input noise vector size
data_dim = X_train.shape[1]  # Ensure it matches the number of features

generator = Generator(latent_dim, data_dim)


In [11]:
class Discriminator(nn.Module):
    def __init__(self, input_dim):
        super(Discriminator, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.model(x)

# Ensure discriminator input matches dataset feature count
discriminator = Discriminator(data_dim)


In [12]:
# Generate fake data
z = torch.randn(64, latent_dim)  # Generate random noise
fake_data = generator(z)  # Output of Generator

# Get real fraud samples from dataset
real_data = torch.tensor(X_train.sample(64).values, dtype=torch.float32)

# Print shapes to verify match
print("Fake Data Shape (Generator Output):", fake_data.shape)  # Should be (64, 9)
print("Real Data Shape (From Dataset):", real_data.shape)  # Should be (64, 9)


Fake Data Shape (Generator Output): torch.Size([64, 9])
Real Data Shape (From Dataset): torch.Size([64, 9])


In [17]:
batch_size = 64

for epoch in range(500):  # Number of training epochs
    # Generate fake fraud samples
    z = torch.randn(batch_size, latent_dim)  # Latent noise
    fake_data = generator(z)  # Generator output

    # Get real fraud transactions
    real_data = torch.tensor(X_train.sample(batch_size).values, dtype=torch.float32)

    # Labels
    real_labels = torch.ones(batch_size, 1)  # Real transactions = 1
    fake_labels = torch.zeros(batch_size, 1)  # Fake transactions = 0

    # Adam optimizer is often used for GANs
    optimizer_D = optim.Adam(discriminator.parameters(), lr=0.0002)
    optimizer_G = optim.Adam(generator.parameters(), lr=0.0002)

    # Train Discriminator
    optimizer_D.zero_grad()
    criterion = nn.BCELoss()
    loss_real = criterion(discriminator(real_data), real_labels)
    loss_fake = criterion(discriminator(fake_data.detach()), fake_labels)
    loss_D = (loss_real + loss_fake) / 2
    loss_D.backward()
    optimizer_D.step()

    # Train Generator
    optimizer_G.zero_grad()
    loss_G = criterion(discriminator(fake_data), real_labels)  # Wants to fool Discriminator
    loss_G.backward()
    optimizer_G.step()

    if epoch % 100 == 0:
        print(f'Epoch [{epoch}/500], Loss D: {loss_D.item()}, Loss G: {loss_G.item()}')


Epoch [0/500], Loss D: 0.6750174760818481, Loss G: 0.7291713356971741
Epoch [100/500], Loss D: 0.2554997503757477, Loss G: 1.1181671619415283
Epoch [200/500], Loss D: 0.06593900918960571, Loss G: 2.32236647605896
Epoch [300/500], Loss D: 0.004839855711907148, Loss G: 4.725375175476074
Epoch [400/500], Loss D: 0.00016250622866209596, Loss G: 8.131942749023438


 Step 4: Train Transformer-based Fraud Classifier

In [18]:
print("Number of Features in X_train:", X_train.shape[1])


Number of Features in X_train: 9


In [19]:
generated_fraud = generator(torch.randn(1000, latent_dim)).detach().numpy()
new_fraud_df = pd.DataFrame(generated_fraud, columns=df_scaled.columns[:-1])
new_fraud_df['fraud_label'] = 1
df_balanced = pd.concat([df_scaled, new_fraud_df], axis=0)


In [20]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

model = Sequential([
    Dense(128, activation='relu', input_shape=(data_dim,)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=30, batch_size=64, validation_data=(X_test, y_test))


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/30
[1m79533/79533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m331s[0m 4ms/step - accuracy: 0.9988 - loss: 0.0069 - val_accuracy: 0.9994 - val_loss: 0.0035
Epoch 2/30
[1m79533/79533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m304s[0m 4ms/step - accuracy: 0.9993 - loss: 0.0038 - val_accuracy: 0.9994 - val_loss: 0.0034
Epoch 3/30
[1m79533/79533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m336s[0m 4ms/step - accuracy: 0.9993 - loss: 0.0036 - val_accuracy: 0.9993 - val_loss: 0.0029
Epoch 4/30
[1m79533/79533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m300s[0m 4ms/step - accuracy: 0.9993 - loss: 0.0039 - val_accuracy: 0.9994 - val_loss: 0.0029
Epoch 5/30
[1m79533/79533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m321s[0m 4ms/step - accuracy: 0.9994 - loss: 0.0043 - val_accuracy: 0.9994 - val_loss: 0.0027
Epoch 6/30
[1m79533/79533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m322s[0m 4ms/step - accuracy: 0.9994 - loss: 0.0036 - val_accuracy: 0.9994 - val_loss:

<keras.src.callbacks.history.History at 0x7faf43df3810>

Step 5: Model Evaluation

In [21]:
y_pred = model.predict(X_test)
y_pred_classes = (y_pred > 0.5).astype(int)

print(classification_report(y_test, y_pred_classes))
print("ROC-AUC Score:", roc_auc_score(y_test, y_pred))


[1m39767/39767[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 1ms/step
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1270904
           1       1.00      0.55      0.71      1620

    accuracy                           1.00   1272524
   macro avg       1.00      0.78      0.86   1272524
weighted avg       1.00      1.00      1.00   1272524

ROC-AUC Score: 0.9934032595967656


Step 6: Explainability using SHAP

In [None]:
import shap
explainer = shap.Explainer(model, X_train)
shap_values = explainer(X_test)
shap.summary_plot(shap_values, X_test)