# GNN Anomaly Detection with GAE + GraphSAGE using BankSim

In [1]:

import pandas as pd

df = pd.read_csv('../data/bs140513_032310.csv')
print(df.head())
print(df['fraud'].value_counts())


   step       customer  age gender zipcodeOri       merchant zipMerchant  \
0     0  'C1093826151'  '4'    'M'    '28007'   'M348934600'     '28007'   
1     0   'C352968107'  '2'    'M'    '28007'   'M348934600'     '28007'   
2     0  'C2054744914'  '4'    'F'    '28007'  'M1823072687'     '28007'   
3     0  'C1760612790'  '3'    'M'    '28007'   'M348934600'     '28007'   
4     0   'C757503768'  '5'    'M'    '28007'   'M348934600'     '28007'   

              category  amount  fraud  
0  'es_transportation'    4.55      0  
1  'es_transportation'   39.68      0  
2  'es_transportation'   26.89      0  
3  'es_transportation'   17.25      0  
4  'es_transportation'   35.72      0  
fraud
0    587443
1      7200
Name: count, dtype: int64


In [2]:

import re

def get_numeric_id(customer_id):
    numeric_only = re.sub(r'[^0-9]', '', str(customer_id))
    return int(numeric_only) if numeric_only else 0

df['device_fp'] = df['customer'].apply(lambda x: f"fp_{get_numeric_id(x) % 1000}")


In [3]:

import networkx as nx

G = nx.Graph()
for _, row in df.iterrows():
    if row['fraud'] == 0:  # Use only normal transactions for training
        u = f"user_{row['customer']}"
        m = f"merch_{row['merchant']}"
        G.add_node(u, type='user')
        G.add_node(m, type='merchant')
        G.add_edge(u, m)


In [5]:

import torch
from torch_geometric.utils import from_networkx
from torch_geometric.data import Data

data = from_networkx(G)
data.x = torch.eye(data.num_nodes)  # Identity features as placeholder


In [6]:

from torch_geometric.nn import GAE, SAGEConv

class GNNEncoder(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv(in_channels, 2 * out_channels)
        self.conv2 = SAGEConv(2 * out_channels, out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        return self.conv2(x, edge_index)

encoder = GNNEncoder(data.num_node_features, 32)
model = GAE(encoder)


In [7]:

from torch_geometric.utils import train_test_split_edges

data = train_test_split_edges(data)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

def train():
    model.train()
    optimizer.zero_grad()
    z = model.encode(data.x, data.train_pos_edge_index)
    loss = model.recon_loss(z, data.train_pos_edge_index)
    loss.backward()
    optimizer.step()
    return loss.item()

for epoch in range(1, 101):
    loss = train()
    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {loss:.4f}")




Epoch 10, Loss: 1.0605
Epoch 20, Loss: 0.8523
Epoch 30, Loss: 1.1409
Epoch 40, Loss: 0.9314
Epoch 50, Loss: 0.8461
Epoch 60, Loss: 0.8142
Epoch 70, Loss: 0.7971
Epoch 80, Loss: 0.7859
Epoch 90, Loss: 0.7766
Epoch 100, Loss: 0.7718


In [8]:

from sklearn.metrics import roc_auc_score
from torch_geometric.utils import negative_sampling

model.eval()
with torch.no_grad():
    z = model.encode(data.x, data.train_pos_edge_index)
    pos_pred = model.decoder(z, data.test_pos_edge_index).squeeze()

    neg_edge_index = negative_sampling(
        edge_index=data.train_pos_edge_index,
        num_nodes=z.size(0),
        num_neg_samples=data.test_pos_edge_index.size(1)
    )
    neg_pred = model.decoder(z, neg_edge_index).squeeze()

    y_true = torch.cat([torch.ones(pos_pred.size(0)), torch.zeros(neg_pred.size(0))])
    y_score = torch.cat([pos_pred, neg_pred])

    auc = roc_auc_score(y_true.cpu(), y_score.cpu())
    print(f"AUC Score: {auc:.4f}")


AUC Score: 0.9960
