In [13]:
# Notebook: 02-Model-Training-GNN-PyTorch-3.ipynb (Extended for IP & Multi-Task)
# =============================================================================

# --------------------------------------------------------------------------------
# Cell 1: Imports & Setup
# --------------------------------------------------------------------------------
import os
import json
import numpy as np

import torch
import torch.nn.functional as F
from torch_geometric.data import HeteroData
from torch_geometric.nn import HeteroConv, SAGEConv, Linear

# Seed every RNG this notebook relies on so that "Restart Kernel -> Run All"
# reproduces the same weight initialisation and therefore the same metrics.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)  # seeds the default generator on all devices

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)

Using device: cpu


In [14]:
# --------------------------------------------------------------------------------
# Cell 2: Configuration
# --------------------------------------------------------------------------------
# Directory that holds the pre-processed GNN inputs (.npy arrays and metadata.json).
# Export VERISHIELD_PROCESSED_DIR to point the notebook at another location without
# editing this cell; when the variable is unset the original local path is used.
PROCESSED_DIR = os.environ.get(
    "VERISHIELD_PROCESSED_DIR",
    "/Users/harshil/Development/"
    "GitHub_Repos/VeriShield-AI-Financial-Verification-Platform/"
    "verishield_ml_experiments/data_generators/data/medium_fraud/processed_gnn",
)

# If you want multi-task classification (user + business):
MULTI_TASK = True   # or False if only classifying user nodes

# If you want IP classification (less common, but you can experiment):
IP_CLASSIFICATION = True

HIDDEN_DIM = 64          # width passed to the model as hidden_dim
LEARNING_RATE = 1e-3     # optimizer step size
EPOCHS = 10              # number of passes of the outer training loop
STEPS_PER_EPOCH = 120    # multiple gradient updates per epoch
PRINT_EVERY = 1          # print metrics every N epochs

print(f"Processed GNN data at: {PROCESSED_DIR}")
print(f"MULTI_TASK={MULTI_TASK}, IP_CLASSIFICATION={IP_CLASSIFICATION}")
print(f"HIDDEN_DIM={HIDDEN_DIM}, LR={LEARNING_RATE}, EPOCHS={EPOCHS}, STEPS_PER_EPOCH={STEPS_PER_EPOCH}")

Processed GNN data at: /Users/harshil/Development/GitHub_Repos/VeriShield-AI-Financial-Verification-Platform/verishield_ml_experiments/data_generators/data/medium_fraud/processed_gnn
MULTI_TASK=True, IP_CLASSIFICATION=True
HIDDEN_DIM=64, LR=0.001, EPOCHS=10, STEPS_PER_EPOCH=120


In [15]:
# --------------------------------------------------------------------------------
# Cell 3: Loading .npy Arrays
# --------------------------------------------------------------------------------
def load_npy(filename):
    """Read one ``.npy`` file from ``PROCESSED_DIR``, log a summary, and return it.

    Args:
        filename: File name, joined onto ``PROCESSED_DIR``.

    Returns:
        Whatever ``np.load`` returns for that file.
    """
    arr = np.load(os.path.join(PROCESSED_DIR, filename))
    # One log line per file: name, shape and dtype.
    print(f"Loaded {filename}, shape={arr.shape}, dtype={arr.dtype}")
    return arr

# Node inputs: features are converted to float32 and labels to int64,
# one pair of arrays per node type (user, business, ip).
user_features  = torch.from_numpy(load_npy("user_features.npy")).to(torch.float32)
user_labels    = torch.from_numpy(load_npy("user_labels.npy")).to(torch.int64)
biz_features   = torch.from_numpy(load_npy("biz_features.npy")).to(torch.float32)
biz_labels     = torch.from_numpy(load_npy("biz_labels.npy")).to(torch.int64)
ip_features    = torch.from_numpy(load_npy("ip_features.npy")).to(torch.float32)
ip_labels      = torch.from_numpy(load_npy("ip_labels.npy")).to(torch.int64)

# Edge lists, converted to int64; each has two rows and one column per edge.
edge_user_user = torch.from_numpy(load_npy("edge_user_user.npy")).to(torch.int64)  # [2, E_uu]
edge_user_biz  = torch.from_numpy(load_npy("edge_user_biz.npy")).to(torch.int64)   # [2, E_ub]
edge_user_ip   = torch.from_numpy(load_npy("edge_user_ip.npy")).to(torch.int64)    # [2, E_ui]

def _load_mask_triplet(suffix):
    """Load the train/val/test masks for one node type as boolean tensors.

    The three files are read before anything is returned, so a caller that
    unpacks the result either receives all three masks or (on error) none.

    Args:
        suffix: Node-type suffix used in the file names, e.g. ``"users"`` for
            ``train_mask_users.npy`` / ``val_mask_users.npy`` / ``test_mask_users.npy``.

    Returns:
        Tuple ``(train_mask, val_mask, test_mask)`` of boolean tensors.

    Raises:
        FileNotFoundError: If any of the three files does not exist.
    """
    return tuple(
        torch.from_numpy(load_npy(f"{split}_mask_{suffix}.npy")).bool()
        for split in ("train", "val", "test")
    )


# Masks: user
# All three names stay None unless the complete train/val/test set could be loaded;
# a partially present set must not leave train set and val/test None.
train_mask_users = val_mask_users = test_mask_users = None
try:
    train_mask_users, val_mask_users, test_mask_users = _load_mask_triplet("users")
except FileNotFoundError:
    print("No user masks found. Using full-batch for user classification if MULTI_TASK=False.")

# Masks: business (only needed when the business head is trained)
train_mask_biz = val_mask_biz = test_mask_biz = None
if MULTI_TASK:
    try:
        train_mask_biz, val_mask_biz, test_mask_biz = _load_mask_triplet("biz")
    except FileNotFoundError:
        print("No business masks found. MULTI_TASK might not work properly without them.")

# Masks: IP (only needed when the IP head is trained)
train_mask_ip = val_mask_ip = test_mask_ip = None
if IP_CLASSIFICATION:
    try:
        train_mask_ip, val_mask_ip, test_mask_ip = _load_mask_triplet("ip")
    except FileNotFoundError:
        print("No IP masks found. IP classification won't be possible without them.")

# Optionally load metadata.json
# `metadata` is always bound (empty dict when the file is absent) so that any later
# reference works on a fresh kernel whether or not the file exists.
metadata = {}
meta_path = os.path.join(PROCESSED_DIR, "metadata.json")
if os.path.isfile(meta_path):
    # Read as UTF-8 explicitly instead of relying on the platform default encoding.
    with open(meta_path, "r", encoding="utf-8") as f:
        metadata = json.load(f)
    print("\nmetadata.json contents:")
    for k, v in metadata.items():
        print(f"  {k}: {v}")

Loaded user_features.npy, shape=(100000, 7), dtype=float32
Loaded user_labels.npy, shape=(100000,), dtype=int64
Loaded biz_features.npy, shape=(10000, 3), dtype=float32
Loaded biz_labels.npy, shape=(10000,), dtype=int64
Loaded ip_features.npy, shape=(5000, 1), dtype=float32
Loaded ip_labels.npy, shape=(5000,), dtype=int64
Loaded edge_user_user.npy, shape=(2, 10274), dtype=int64
Loaded edge_user_biz.npy, shape=(2, 221613), dtype=int64
Loaded edge_user_ip.npy, shape=(2, 100000), dtype=int64
Loaded train_mask_users.npy, shape=(100000,), dtype=bool
Loaded val_mask_users.npy, shape=(100000,), dtype=bool
Loaded test_mask_users.npy, shape=(100000,), dtype=bool
Loaded train_mask_biz.npy, shape=(10000,), dtype=bool
Loaded val_mask_biz.npy, shape=(10000,), dtype=bool
Loaded test_mask_biz.npy, shape=(10000,), dtype=bool
Loaded train_mask_ip.npy, shape=(5000,), dtype=bool
Loaded val_mask_ip.npy, shape=(5000,), dtype=bool
Loaded test_mask_ip.npy, shape=(5000,), dtype=bool

metadata.json contents:
 

In [16]:
# --------------------------------------------------------------------------------
# Cell 4: Build a HeteroData Graph (Users, Businesses, IPs)
# --------------------------------------------------------------------------------
data = HeteroData()

# --- Node stores: features (x), labels (y) and, when available, split masks ---
user_store = data['user']
user_store.x, user_store.y = user_features, user_labels
if train_mask_users is not None:
    user_store.train_mask = train_mask_users
    user_store.val_mask = val_mask_users
    user_store.test_mask = test_mask_users

business_store = data['business']
business_store.x, business_store.y = biz_features, biz_labels
if MULTI_TASK and train_mask_biz is not None:
    business_store.train_mask = train_mask_biz
    business_store.val_mask = val_mask_biz
    business_store.test_mask = test_mask_biz

ip_store = data['ip']
ip_store.x, ip_store.y = ip_features, ip_labels
if IP_CLASSIFICATION and train_mask_ip is not None:
    ip_store.train_mask = train_mask_ip
    ip_store.val_mask = val_mask_ip
    ip_store.test_mask = test_mask_ip

# --- Forward relations, keyed by (source type, relation name, destination type) ---
data['user', 'user_user', 'user'].edge_index = edge_user_user
data['user', 'user_business', 'business'].edge_index = edge_user_biz
data['user', 'user_ip', 'ip'].edge_index = edge_user_ip

# --- Reverse relations so businesses and IPs can also send messages to users ---
# Swapping the two rows of an edge_index turns every (src, dst) pair into (dst, src).
rev_user_biz = edge_user_biz.flip(0)
data['business', 'rev_user_business', 'user'].edge_index = rev_user_biz

rev_user_ip = edge_user_ip.flip(0)
data['ip', 'rev_user_ip', 'user'].edge_index = rev_user_ip

print("\nHeteroData object created with the following node_types and edge_types:")
print("Node types:", list(data.node_types))
print("Edge types:", list(data.edge_types))

# Move the whole graph to the selected device (GPU if available, otherwise CPU)
data = data.to(device)


HeteroData object created with the following node_types and edge_types:
Node types: ['user', 'business', 'ip']
Edge types: [('user', 'user_user', 'user'), ('user', 'user_business', 'business'), ('user', 'user_ip', 'ip'), ('business', 'rev_user_business', 'user'), ('ip', 'rev_user_ip', 'user')]


In [17]:
# --------------------------------------------------------------------------------
# Cell 5: Define a GNN Model (Supports Multi-Edges & Optional Multi-Task)
# --------------------------------------------------------------------------------
class FraudGNN(torch.nn.Module):
    """Two-layer heterogeneous GraphSAGE encoder with per-node-type linear heads.

    ``forward`` returns a dictionary of embeddings keyed by node type; the
    ``forward_user`` / ``forward_business`` / ``forward_ip`` methods turn those
    embeddings into class logits. The business and IP heads only exist when
    ``multi_task`` / ``ip_task`` are enabled.
    """

    def __init__(self, hidden_dim=64, user_out=2, biz_out=2, ip_out=2, multi_task=False, ip_task=False):
        """Build both message-passing layers and the classification heads.

        Args:
            hidden_dim: Output width of every SAGEConv and input width of every head.
            user_out: Number of logits produced by the user head.
            biz_out: Number of logits produced by the business head.
            ip_out: Number of logits produced by the IP head.
            multi_task: When True, a business head is created.
            ip_task: When True, an IP head is created.
        """
        super().__init__()
        self.multi_task = multi_task
        self.ip_task = ip_task

        # Every relation handled by each layer: three forward and two reverse edge types.
        relations = (
            ('user', 'user_user', 'user'),
            ('user', 'user_business', 'business'),
            ('business', 'rev_user_business', 'user'),
            ('user', 'user_ip', 'ip'),
            ('ip', 'rev_user_ip', 'user'),
        )

        def make_layer():
            # One SAGEConv per relation with lazily-inferred input sizes (-1, -1);
            # results arriving at the same node type are combined with 'mean'.
            return HeteroConv(
                {relation: SAGEConv((-1, -1), hidden_dim) for relation in relations},
                aggr='mean',
            )

        self.conv1 = make_layer()
        self.conv2 = make_layer()

        # Heads: the user head always exists; the others are None when disabled.
        self.user_lin = Linear(hidden_dim, user_out)
        self.business_lin = Linear(hidden_dim, biz_out) if multi_task else None
        self.ip_lin = Linear(hidden_dim, ip_out) if ip_task else None

    def forward(self, x_dict, edge_index_dict):
        """Run both conv layers, each followed by ReLU, and return the embeddings dict."""
        for layer in (self.conv1, self.conv2):
            x_dict = layer(x_dict, edge_index_dict)
            x_dict = {ntype: F.relu(x) for ntype, x in x_dict.items()}
        return x_dict

    def forward_user(self, emb_dict):
        """Return user logits computed from ``emb_dict['user']``."""
        return self.user_lin(emb_dict['user'])

    def forward_business(self, emb_dict):
        """Return business logits, or None when the business head is disabled."""
        head = self.business_lin
        return None if head is None else head(emb_dict['business'])

    def forward_ip(self, emb_dict):
        """Return IP logits, or None when the IP head is disabled."""
        head = self.ip_lin
        return None if head is None else head(emb_dict['ip'])

# Instantiate
model = FraudGNN(
    hidden_dim=HIDDEN_DIM,
    user_out=2,       # (legit/fraud)
    biz_out=2,        # (legit/fraud) if multi-task
    ip_out=2,         # if ip classification
    multi_task=MULTI_TASK,
    ip_task=IP_CLASSIFICATION
).to(device)

# The SAGEConv layers are declared with lazy input sizes (-1, -1), so their weights
# are only created on the first forward pass. Run one gradient-free pass now so that
# every parameter is materialised before an optimizer is built from model.parameters().
with torch.no_grad():
    model(data.x_dict, data.edge_index_dict)

print(model)

FraudGNN(
  (conv1): HeteroConv(num_relations=5)
  (conv2): HeteroConv(num_relations=5)
  (user_lin): Linear(64, 2, bias=True)
  (business_lin): Linear(64, 2, bias=True)
  (ip_lin): Linear(64, 2, bias=True)
)


In [18]:
# --------------------------------------------------------------------------------
# Cell 6: Training Setup (Multi-Task or Single)
# --------------------------------------------------------------------------------
# Adam over all model parameters; the step size comes from the LEARNING_RATE constant.
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

def train_step():
    """Run one full-graph optimisation step and return the summed loss as a float.

    The loss is the user cross-entropy plus, when the corresponding head is
    enabled and a ``train_mask`` exists for that node type, the business and IP
    cross-entropies. Uses the module-level ``model``, ``data`` and ``optimizer``.
    """
    model.train()
    optimizer.zero_grad()

    embeddings = model(data.x_dict, data.edge_index_dict)

    def masked_loss(logits, store):
        # Cross-entropy restricted to the rows selected by the store's train mask.
        selected = store.train_mask
        return F.cross_entropy(logits[selected], store.y[selected])

    # User head: always trained; without a train mask every user node is used.
    user_store = data['user']
    user_logits = model.forward_user(embeddings)
    user_mask = user_store.train_mask if 'train_mask' in user_store else None
    if user_mask is None:
        user_loss = F.cross_entropy(user_logits, user_store.y)
    else:
        user_loss = F.cross_entropy(user_logits[user_mask], user_store.y[user_mask])

    # Business head: contributes only in multi-task mode with a train mask present.
    biz_loss = 0.
    if model.multi_task and 'train_mask' in data['business']:
        biz_loss = masked_loss(model.forward_business(embeddings), data['business'])

    # IP head: contributes only when the IP task is on and a train mask is present.
    ip_loss = 0.
    if model.ip_task and 'train_mask' in data['ip']:
        ip_loss = masked_loss(model.forward_ip(embeddings), data['ip'])

    total_loss = user_loss + biz_loss + ip_loss
    total_loss.backward()
    optimizer.step()
    return float(total_loss)

def _masked_accuracy(logits, labels, mask=None):
    """Return argmax accuracy as a float, optionally restricted to ``mask``.

    Args:
        logits: Per-node class scores; predictions are the argmax over the last dim.
        labels: Ground-truth class indices aligned with ``logits``.
        mask: Optional boolean selector over nodes; ``None`` means use every node.

    Returns:
        Fraction of selected nodes predicted correctly, or 0.0 when nothing is
        selected (avoids the NaN that a 0/0 division would produce).
    """
    if mask is not None:
        logits, labels = logits[mask], labels[mask]
    total = labels.numel()
    if total == 0:
        return 0.0
    correct = (logits.argmax(dim=-1) == labels).sum()
    return float(correct / total)


def _evaluate_optional_head(node_type, enabled, head_fn, mask_name):
    """Accuracy for an optional head; 0.0 when the head is off or the mask is missing."""
    if not enabled or mask_name not in data[node_type]:
        return 0.0
    model.eval()
    emb_dict = model(data.x_dict, data.edge_index_dict)
    store = data[node_type]
    return _masked_accuracy(head_fn(emb_dict), store.y, store[mask_name])


@torch.no_grad()
def evaluate_user(mask_name='val_mask'):
    """Compute user accuracy on the given mask.

    Args:
        mask_name: Name of the mask attribute on the user store (e.g. 'val_mask'
            or 'test_mask'). If the store has no such attribute, accuracy is
            computed over all user nodes.

    Returns:
        Accuracy as a Python float (0.0 if the mask selects no nodes).
    """
    model.eval()
    emb_dict = model(data.x_dict, data.edge_index_dict)
    user_store = data['user']
    mask = user_store[mask_name] if mask_name in user_store else None
    return _masked_accuracy(model.forward_user(emb_dict), user_store.y, mask)


@torch.no_grad()
def evaluate_biz(mask_name='val_mask'):
    """Compute business accuracy on the given mask.

    Returns 0.0 when the model has no business head (``multi_task`` is False),
    when the business store has no ``mask_name`` attribute, or when the mask
    selects no nodes.
    """
    return _evaluate_optional_head(
        'business', model.multi_task, model.forward_business, mask_name
    )


@torch.no_grad()
def evaluate_ip(mask_name='val_mask'):
    """Compute IP accuracy on the given mask.

    Returns 0.0 when the model has no IP head (``ip_task`` is False), when the
    IP store has no ``mask_name`` attribute, or when the mask selects no nodes.
    """
    return _evaluate_optional_head('ip', model.ip_task, model.forward_ip, mask_name)

In [19]:
# --------------------------------------------------------------------------------
# Cell 7: Main Training Loop
# --------------------------------------------------------------------------------
for epoch in range(1, EPOCHS+1):
    # Accumulate the loss of every gradient step, then average over the epoch.
    loss_sum = 0.0
    for step in range(STEPS_PER_EPOCH):
        loss_val = train_step()
        loss_sum = loss_sum + loss_val
    avg_loss = loss_sum / STEPS_PER_EPOCH

    # Validation accuracy per head; heads that are switched off report 0.0.
    val_acc_user = evaluate_user('val_mask')
    val_acc_biz = 0.0
    if MULTI_TASK:
        val_acc_biz = evaluate_biz('val_mask')
    val_acc_ip = 0.0
    if IP_CLASSIFICATION:
        val_acc_ip = evaluate_ip('val_mask')

    # Only report on every PRINT_EVERY-th epoch.
    if epoch % PRINT_EVERY != 0:
        continue
    print(f"Epoch {epoch}/{EPOCHS} => loss={avg_loss:.4f}, "
          f"user_val_acc={val_acc_user:.4f}, biz_val_acc={val_acc_biz:.4f}, ip_val_acc={val_acc_ip:.4f}")

# Held-out test accuracy for every enabled head, computed once after training.
print("\nFinal Evaluations:")
user_test_acc = evaluate_user('test_mask')
print(f"User Test Accuracy: {user_test_acc:.4f}")
if MULTI_TASK:
    biz_test_acc = evaluate_biz('test_mask')
    print(f"Business Test Accuracy: {biz_test_acc:.4f}")
if IP_CLASSIFICATION:
    ip_test_acc = evaluate_ip('test_mask')
    print(f"IP Test Accuracy: {ip_test_acc:.4f}")

Epoch 1/10 => loss=0.7787, user_val_acc=0.7445, biz_val_acc=0.9653, ip_val_acc=1.0000
Epoch 2/10 => loss=0.6784, user_val_acc=0.7445, biz_val_acc=0.9653, ip_val_acc=1.0000
Epoch 3/10 => loss=0.6745, user_val_acc=0.7445, biz_val_acc=0.9653, ip_val_acc=1.0000
Epoch 4/10 => loss=0.6687, user_val_acc=0.7440, biz_val_acc=0.9653, ip_val_acc=1.0000
Epoch 5/10 => loss=0.6624, user_val_acc=0.7427, biz_val_acc=0.9653, ip_val_acc=1.0000
Epoch 6/10 => loss=0.6567, user_val_acc=0.7435, biz_val_acc=0.9653, ip_val_acc=1.0000
Epoch 7/10 => loss=0.6528, user_val_acc=0.7415, biz_val_acc=0.9653, ip_val_acc=1.0000
Epoch 8/10 => loss=0.6500, user_val_acc=0.7405, biz_val_acc=0.9653, ip_val_acc=1.0000
Epoch 9/10 => loss=0.6474, user_val_acc=0.7409, biz_val_acc=0.9653, ip_val_acc=1.0000
Epoch 10/10 => loss=0.6449, user_val_acc=0.7407, biz_val_acc=0.9653, ip_val_acc=1.0000

Final Evaluations:
User Test Accuracy: 0.7470
Business Test Accuracy: 0.9713
IP Test Accuracy: 1.0000


In [20]:
# --------------------------------------------------------------------------------
# Cell 8: Wrap-Up
# --------------------------------------------------------------------------------
# Closing summary of what the notebook built; the text is printed verbatim.
wrap_up_message = """
Done!
We've created a robust multi-edge GNN approach using PyTorch Geometric:
- Users, businesses, IPs as node types
- user->user, user->business, user->ip edges (+ reverse edges)
- Optionally multi-task classification for user/business, plus IP classification
- Final test accuracies printed
"""
print(wrap_up_message)


Done!
We've created a robust multi-edge GNN approach using PyTorch Geometric:
- Users, businesses, IPs as node types
- user->user, user->business, user->ip edges (+ reverse edges)
- Optionally multi-task classification for user/business, plus IP classification
- Final test accuracies printed

