# Install Packages

    Requirements for the fraud detection project using Graph Neural Networks, here's the command to install all the necessary packages

In [None]:
! pip install torch_geometric

# Fraud detection using GNN on synthetic transactions data

    This end-to-end implementation demonstrates how to use Graph Neural Networks for fraud detection in financial transactions. The GNN model can capture complex relationships in the data, potentially leading to improved fraud detection compared to traditional machine learning approaches.

# Step 1
    Data Loading and Preprocessing

    In this step, we load the synthetic transaction and identity datasets, merge them based on TransactionID, encode categorical variables, normalize numerical features, and split the data into training and test sets. This preprocessing is crucial for preparing the data for the GNN model.

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import torch
from torch_geometric.data import HeteroData

# Load the synthetic datasets
transactions_df = pd.read_csv('synthetic_transactions.csv')
identity_df = pd.read_csv('synthetic_identity.csv')

# Merge the datasets on TransactionID
merged_df = pd.merge(transactions_df, identity_df, on='TransactionID')

# Encode categorical variables
encoder = LabelEncoder()
categorical_cols = ['ProductCD', 'card_type', 'email_domain', 'DeviceID']
for col in categorical_cols:
    merged_df[col] = encoder.fit_transform(merged_df[col])

# Normalize numerical features
scaler = StandardScaler()
merged_df['TransactionAmt'] = scaler.fit_transform(merged_df[['TransactionAmt']])

# Split the data into train and test sets
train_df, test_df = train_test_split(merged_df, test_size=0.2, random_state=42)

print(merged_df.head())

   TransactionID  ProductCD  TransactionAmt       card_no  card_type  isFraud  \
0              1          2        0.680281  3863XXXX2177          2        0   
1              2          3        0.124621  5760XXXX1011          1        0   
2              3          0       -0.652243  6252XXXX5399          1        0   
3              4          2        1.076765  9234XXXX6445          1        0   
4              5          2        0.634218  7570XXXX9212          0        0   

   email_domain        IpAddress       PhoneNo  DeviceID  
0             1  246.191.145.201  593-800-9182       125  
1             2  161.245.212.205  819-213-5117       173  
2             0  220.240.157.141  448-825-5778       300  
3             1    46.35.253.134  934-665-3953       910  
4             1    82.115.197.47  909-419-1276       532  


# Step 2 
    Graph Construction

    This step constructs a heterogeneous graph from the preprocessed data. We create nodes for transactions, cards, emails, IP addresses, and devices, and establish edges between them. The graph structure allows the GNN to capture complex relationships between different entities in the transaction data.

In [4]:
def create_graph(df):
    data = HeteroData()
    
    # Add node features
    data['transaction'].x = torch.tensor(df[['ProductCD', 'TransactionAmt']].values, dtype=torch.float)
    data['card'].x = torch.tensor(df['card_type'].values.reshape(-1, 1), dtype=torch.float)
    data['email'].x = torch.tensor(df['email_domain'].values.reshape(-1, 1), dtype=torch.float)
    data['ip'].x = torch.tensor(df['IpAddress'].astype('category').cat.codes.values.reshape(-1, 1), dtype=torch.float)
    data['device'].x = torch.tensor(df['DeviceID'].values.reshape(-1, 1), dtype=torch.float)
    
    # Add edges
    num_nodes = len(df)
    data['transaction', 'uses', 'card'].edge_index = torch.tensor(np.array([range(num_nodes), range(num_nodes)]), dtype=torch.long)
    data['transaction', 'from', 'email'].edge_index = torch.tensor(np.array([range(num_nodes), range(num_nodes)]), dtype=torch.long)
    data['transaction', 'through', 'ip'].edge_index = torch.tensor(np.array([range(num_nodes), range(num_nodes)]), dtype=torch.long)
    data['transaction', 'via', 'device'].edge_index = torch.tensor(np.array([range(num_nodes), range(num_nodes)]), dtype=torch.long)
    
    # Add reverse edges
    data['card', 'rev_uses', 'transaction'].edge_index = data['transaction', 'uses', 'card'].edge_index.flip(0)
    data['email', 'rev_from', 'transaction'].edge_index = data['transaction', 'from', 'email'].edge_index.flip(0)
    data['ip', 'rev_through', 'transaction'].edge_index = data['transaction', 'through', 'ip'].edge_index.flip(0)
    data['device', 'rev_via', 'transaction'].edge_index = data['transaction', 'via', 'device'].edge_index.flip(0)
    
    # Add target
    data['transaction'].y = torch.tensor(df['isFraud'].values, dtype=torch.long)
    
    return data

train_data = create_graph(train_df)
test_data = create_graph(test_df)

# Step 3
    Define the GNN Model

    Here, we define the Graph Neural Network model using PyTorch Geometric. The model uses HeteroConv layers with SAGEConv operations to process the heterogeneous graph data. The model architecture includes two graph convolution layers with ReLU activation in between.

In [5]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv, HeteroConv, Linear
from torch_geometric.nn import to_hetero

class GNNEncoder(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = HeteroConv({
            ('transaction', 'uses', 'card'): SAGEConv((-1, -1), hidden_channels),
            ('transaction', 'from', 'email'): SAGEConv((-1, -1), hidden_channels),
            ('transaction', 'through', 'ip'): SAGEConv((-1, -1), hidden_channels),
            ('transaction', 'via', 'device'): SAGEConv((-1, -1), hidden_channels),
            ('card', 'rev_uses', 'transaction'): SAGEConv((-1, -1), hidden_channels),
            ('email', 'rev_from', 'transaction'): SAGEConv((-1, -1), hidden_channels),
            ('ip', 'rev_through', 'transaction'): SAGEConv((-1, -1), hidden_channels),
            ('device', 'rev_via', 'transaction'): SAGEConv((-1, -1), hidden_channels),
        })
        self.conv2 = HeteroConv({
            ('transaction', 'uses', 'card'): SAGEConv((-1, -1), out_channels),
            ('transaction', 'from', 'email'): SAGEConv((-1, -1), out_channels),
            ('transaction', 'through', 'ip'): SAGEConv((-1, -1), out_channels),
            ('transaction', 'via', 'device'): SAGEConv((-1, -1), out_channels),
            ('card', 'rev_uses', 'transaction'): SAGEConv((-1, -1), out_channels),
            ('email', 'rev_from', 'transaction'): SAGEConv((-1, -1), out_channels),
            ('ip', 'rev_through', 'transaction'): SAGEConv((-1, -1), out_channels),
            ('device', 'rev_via', 'transaction'): SAGEConv((-1, -1), out_channels),
        })

    def forward(self, x_dict, edge_index_dict):
        x_dict = self.conv1(x_dict, edge_index_dict)
        x_dict = {key: x.relu() for key, x in x_dict.items()}
        x_dict = self.conv2(x_dict, edge_index_dict)
        return x_dict

class GNN(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        self.encoder = GNNEncoder(hidden_channels, out_channels)
        self.lin = Linear(out_channels, 1)

    def forward(self, x_dict, edge_index_dict):
        z_dict = self.encoder(x_dict, edge_index_dict)
        return self.lin(z_dict['transaction']).view(-1)

# Create the model
model = GNN(hidden_channels=64, out_channels=32)

print(model)

GNN(
  (encoder): GNNEncoder(
    (conv1): HeteroConv(num_relations=8)
    (conv2): HeteroConv(num_relations=8)
  )
  (lin): Linear(32, 1, bias=True)
)


# Step 4
    Training the Model

    This step defines the training process for the GNN model. We use Adam optimizer and CrossEntropyLoss as the loss function. The training loop runs for 200 epochs, updating the model parameters to minimize the loss on the training data.

In [6]:
import torch.optim as optim

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
train_data = train_data.to(device)
test_data = test_data.to(device)

optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.BCEWithLogitsLoss()

def train():
    model.train()
    optimizer.zero_grad()
    out = model(train_data.x_dict, train_data.edge_index_dict)
    loss = criterion(out, train_data['transaction'].y.float())
    loss.backward()
    optimizer.step()
    return float(loss)

for epoch in range(1, 201):
    loss = train()
    if epoch % 10 == 0:
        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

Epoch: 010, Loss: 42.6504
Epoch: 020, Loss: 38.6691
Epoch: 030, Loss: 9.7998
Epoch: 040, Loss: 33.6104
Epoch: 050, Loss: 47.6280
Epoch: 060, Loss: 44.1196
Epoch: 070, Loss: 35.5335
Epoch: 080, Loss: 26.8551
Epoch: 090, Loss: 19.4989
Epoch: 100, Loss: 13.5614
Epoch: 110, Loss: 8.6513
Epoch: 120, Loss: 4.4195
Epoch: 130, Loss: 0.5431
Epoch: 140, Loss: 2.5694
Epoch: 150, Loss: 1.2637
Epoch: 160, Loss: 0.3345
Epoch: 170, Loss: 0.2360
Epoch: 180, Loss: 1.1230
Epoch: 190, Loss: 0.3712
Epoch: 200, Loss: 6.7911


In [8]:
# Save the trained model
torch.save(model.state_dict(), 'fraud_detection_model.pth')
print("Model saved successfully.")

Model saved successfully.


# Step 5 
    Evaluation

    In the final step, we evaluate the trained model on both the training and test datasets. The test function computes the accuracy of the model's predictions. We report the accuracy on both the training and test sets to assess the model's performance and check for potential overfitting.

In [7]:
@torch.no_grad()
def test(data):
    model.eval()
    out = model(data.x_dict, data.edge_index_dict)
    pred = out.sigmoid().round()
    correct = (pred == data['transaction'].y).sum()
    acc = int(correct) / int(data['transaction'].y.shape[0])
    return acc

train_acc = test(train_data)
test_acc = test(test_data)
print(f'Train Accuracy: {train_acc:.4f}')
print(f'Test Accuracy: {test_acc:.4f}')

Train Accuracy: 0.9888
Test Accuracy: 0.9900


In [10]:
import torch
import pandas as pd
from torch_geometric.data import HeteroData
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load the trained model
class GNNEncoder(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = HeteroConv({
            ('transaction', 'uses', 'card'): SAGEConv((-1, -1), hidden_channels),
            ('transaction', 'from', 'email'): SAGEConv((-1, -1), hidden_channels),
            ('transaction', 'through', 'ip'): SAGEConv((-1, -1), hidden_channels),
            ('transaction', 'via', 'device'): SAGEConv((-1, -1), hidden_channels),
            ('card', 'rev_uses', 'transaction'): SAGEConv((-1, -1), hidden_channels),
            ('email', 'rev_from', 'transaction'): SAGEConv((-1, -1), hidden_channels),
            ('ip', 'rev_through', 'transaction'): SAGEConv((-1, -1), hidden_channels),
            ('device', 'rev_via', 'transaction'): SAGEConv((-1, -1), hidden_channels),
        })
        self.conv2 = HeteroConv({
            ('transaction', 'uses', 'card'): SAGEConv((-1, -1), out_channels),
            ('transaction', 'from', 'email'): SAGEConv((-1, -1), out_channels),
            ('transaction', 'through', 'ip'): SAGEConv((-1, -1), out_channels),
            ('transaction', 'via', 'device'): SAGEConv((-1, -1), out_channels),
            ('card', 'rev_uses', 'transaction'): SAGEConv((-1, -1), out_channels),
            ('email', 'rev_from', 'transaction'): SAGEConv((-1, -1), out_channels),
            ('ip', 'rev_through', 'transaction'): SAGEConv((-1, -1), out_channels),
            ('device', 'rev_via', 'transaction'): SAGEConv((-1, -1), out_channels),
        })

    def forward(self, x_dict, edge_index_dict):
        x_dict = self.conv1(x_dict, edge_index_dict)
        x_dict = {key: x.relu() for key, x in x_dict.items()}
        x_dict = self.conv2(x_dict, edge_index_dict)
        return x_dict

class GNN(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        self.encoder = GNNEncoder(hidden_channels, out_channels)
        self.lin = Linear(out_channels, 1)

    def forward(self, x_dict, edge_index_dict):
        z_dict = self.encoder(x_dict, edge_index_dict)
        return self.lin(z_dict['transaction']).view(-1)


# Load the saved model
model = GNN(hidden_channels=64, out_channels=32)
model.load_state_dict(torch.load('fraud_detection_model.pth'))
model.eval()

# Initialize encoders and scaler (these should be the same as used in training)
encoder = LabelEncoder()
scaler = StandardScaler()

def get_user_input():
    print("Please enter the transaction details:")
    transaction = {
        'TransactionID': input("Transaction ID: "),
        'ProductCD': input("Product Code: "),
        'TransactionAmt': float(input("Transaction Amount: ")),
        'card_type': input("Card Type: "),
        'card_no': input("Card Number (last 4 digits): "),
        'email_domain': input("Email Domain: "),
        'IpAddress': input("IP Address: "),
        'DeviceID': input("Device ID: ")
    }
    return pd.DataFrame([transaction])

def preprocess_input(transaction_df):
    # Encode categorical variables (use the same encoding as in training)
    categorical_cols = ['ProductCD', 'card_type', 'email_domain', 'DeviceID']
    for col in categorical_cols:
        transaction_df[col] = encoder.fit_transform(transaction_df[col].astype(str))

    # Normalize TransactionAmt (use the same scaling as in training)
    transaction_df['TransactionAmt'] = scaler.fit_transform(transaction_df[['TransactionAmt']])

    return transaction_df

def create_graph_data(transaction_df):
    data = HeteroData()
    
    # Add node features
    data['transaction'].x = torch.tensor(transaction_df[['ProductCD', 'TransactionAmt']].values, dtype=torch.float)
    data['card'].x = torch.tensor(transaction_df['card_type'].values.reshape(-1, 1), dtype=torch.float)
    data['email'].x = torch.tensor(transaction_df['email_domain'].values.reshape(-1, 1), dtype=torch.float)
    data['ip'].x = torch.tensor(transaction_df['IpAddress'].astype('category').cat.codes.values.reshape(-1, 1), dtype=torch.float)
    data['device'].x = torch.tensor(transaction_df['DeviceID'].values.reshape(-1, 1), dtype=torch.float)
    
    # Add edges (assuming single transaction, so all indices are 0)
    data['transaction', 'uses', 'card'].edge_index = torch.tensor([[0], [0]], dtype=torch.long)
    data['transaction', 'from', 'email'].edge_index = torch.tensor([[0], [0]], dtype=torch.long)
    data['transaction', 'through', 'ip'].edge_index = torch.tensor([[0], [0]], dtype=torch.long)
    data['transaction', 'via', 'device'].edge_index = torch.tensor([[0], [0]], dtype=torch.long)
    data['card', 'rev_uses', 'transaction'].edge_index = torch.tensor([[0], [0]], dtype=torch.long)
    data['email', 'rev_from', 'transaction'].edge_index = torch.tensor([[0], [0]], dtype=torch.long)
    data['ip', 'rev_through', 'transaction'].edge_index = torch.tensor([[0], [0]], dtype=torch.long)
    data['device', 'rev_via', 'transaction'].edge_index = torch.tensor([[0], [0]], dtype=torch.long)
    
    return data

def predict_fraud(model, data):
    model.eval()
    with torch.no_grad():
        out = model(data.x_dict, data.edge_index_dict)
        prob = torch.sigmoid(out)
        return prob.item()

# Main prediction flow
if __name__ == "__main__":
    # Get user input
    transaction_df = get_user_input()
    
    # Preprocess the input
    preprocessed_df = preprocess_input(transaction_df)
    
    # Create graph data
    graph_data = create_graph_data(preprocessed_df)
    
    # Make prediction
    fraud_probability = predict_fraud(model, graph_data)
    
    print(f"Fraud probability for the transaction: {fraud_probability:.4f}")
    if fraud_probability > 0.5:
        print("This transaction is likely to be fraudulent.")
    else:
        print("This transaction appears to be legitimate.")

  model.load_state_dict(torch.load('fraud_detection_model.pth'))


Please enter the transaction details:
Fraud probability for the transaction: 0.0215
This transaction appears to be legitimate.
