# Notebook: 02-Model-Training-GNN-PyTorch.ipynb

**Purpose**:
- Load .npy-based node features/labels + adjacency from "processed_gnn/"
- Build a PyTorch Geometric HeteroData for "user" & "business" node sets
- Define a 2-layer GNN (HeteroConv + SAGEConv) for user-fraud classification (0=legit,1=fraud)
- Train using user train_mask, track val_mask accuracy, and evaluate test_mask accuracy.

**Key Steps**:
1. Load .npy arrays (features, labels, edges, masks).
2. Construct a PyG HeteroData object with user and business node sets.
3. Define a GNN model with two rounds of message passing.
4. Train the model for several epochs, only backprop on train_mask users.
5. Report validation accuracy per epoch and final test accuracy.

In [1]:
# ---------------------------------------------------------------------------------------
# Cell 1: Imports & Setup
# ---------------------------------------------------------------------------------------
import os
import json
import numpy as np

import torch
import torch.nn.functional as F
from torch_geometric.data import HeteroData
from torch_geometric.nn import HeteroConv, SAGEConv, Linear

# Check GPU availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)


Using device: cpu


In [2]:
# ---------------------------------------------------------------------------------------
# Cell 2: Configuration
# ---------------------------------------------------------------------------------------
# Adjust this path to point to your 'processed_gnn/' directory
PROCESSED_DIR = (
    "/Users/harshil/Development/"
    "GitHub_Repos/VeriShield-AI-Financial-Verification-Platform/"
    "verishield_ml_experiments/data_generators/data-huge/medium_fraud/processed_gnn"
)

HIDDEN_DIM = 64
LEARNING_RATE = 1e-3
EPOCHS = 10
STEPS_PER_EPOCH = 120  # If you want multiple updates per epoch
PRINT_EVERY = 1       # Print validation accuracy every epoch

print(f"Processed GNN data at: {PROCESSED_DIR}")
print(f"HIDDEN_DIM={HIDDEN_DIM}, LR={LEARNING_RATE}, EPOCHS={EPOCHS}, STEPS_PER_EPOCH={STEPS_PER_EPOCH}")

Processed GNN data at: /Users/harshil/Development/GitHub_Repos/VeriShield-AI-Financial-Verification-Platform/verishield_ml_experiments/data_generators/data-huge/medium_fraud/processed_gnn
HIDDEN_DIM=64, LR=0.001, EPOCHS=5, STEPS_PER_EPOCH=10


In [3]:
# ---------------------------------------------------------------------------------------
# Cell 3: Loading .npy Arrays
# ---------------------------------------------------------------------------------------
def load_npy(filename):
    path = os.path.join(PROCESSED_DIR, filename)
    arr = np.load(path)
    print(f"Loaded {filename}, shape={arr.shape}, dtype={arr.dtype}")
    return arr

user_features    = torch.from_numpy(load_npy("user_features.npy")).float()
user_labels      = torch.from_numpy(load_npy("user_labels.npy")).long()
biz_features     = torch.from_numpy(load_npy("biz_features.npy")).float()
biz_labels       = torch.from_numpy(load_npy("biz_labels.npy")).long()  # optional if multi-task

edge_user_user   = torch.from_numpy(load_npy("edge_user_user.npy")).long()  # shape [2, E_uu]
edge_user_biz    = torch.from_numpy(load_npy("edge_user_biz.npy")).long()   # shape [2, E_ub]

train_mask_users = torch.from_numpy(load_npy("train_mask_users.npy")).bool()
val_mask_users   = torch.from_numpy(load_npy("val_mask_users.npy")).bool()
test_mask_users  = torch.from_numpy(load_npy("test_mask_users.npy")).bool()

# Optionally load metadata.json
meta_path = os.path.join(PROCESSED_DIR, "metadata.json")
if os.path.isfile(meta_path):
    with open(meta_path, "r") as f:
        metadata = json.load(f)
    print("\nmetadata.json contents:")
    for k,v in metadata.items():
        print(f"  {k}: {v}")


Loaded user_features.npy, shape=(843973, 7), dtype=float32
Loaded user_labels.npy, shape=(843973,), dtype=int64
Loaded biz_features.npy, shape=(121992, 3), dtype=float32
Loaded biz_labels.npy, shape=(121992,), dtype=int64
Loaded edge_user_user.npy, shape=(2, 84430), dtype=int64
Loaded edge_user_biz.npy, shape=(2, 1852705), dtype=int64
Loaded train_mask_users.npy, shape=(843973,), dtype=bool
Loaded val_mask_users.npy, shape=(843973,), dtype=bool
Loaded test_mask_users.npy, shape=(843973,), dtype=bool

metadata.json contents:
  scenario: medium_fraud
  num_users: 843973
  num_businesses: 121992
  user_feature_cols: ['segment_code', 'is_ring_leader', 'ip_count_log', 'phone_susp', 'email_susp', 'country_watch', 'burst_signup']
  biz_feature_cols: ['watchlist_regctry', 'susp_name_flag', 'biz_age_log']
  do_split: True
  train_ratio: 0.7
  val_ratio: 0.15
  test_ratio: 0.15
  SINGLE_TASK_USER_ONLY: True
  edges_user_user_count: 84430
  edges_user_biz_count: 1852705


In [4]:
# ---------------------------------------------------------------------------------------
# Cell 4: Build a HeteroData Graph
# ---------------------------------------------------------------------------------------
data = HeteroData()

# "user" node set
data['user'].x = user_features           # shape [num_users, feat_dim]
data['user'].y = user_labels            # shape [num_users]
data['user'].train_mask = train_mask_users
data['user'].val_mask   = val_mask_users
data['user'].test_mask  = test_mask_users

# "business" node set
data['business'].x = biz_features
data['business'].y = biz_labels  # if you want multi-task, else can omit

# Edges: user->user
data[('user','user_user','user')].edge_index = edge_user_user

# Edges: user->business
data[('user','user_business','business')].edge_index = edge_user_biz

print("HeteroData object created with the following node_types and edge_types:")
print("Node types:", list(data.node_types))
print("Edge types:", list(data.edge_types))

# Move entire data to GPU (if available)
data = data.to(device)


HeteroData object created with the following node_types and edge_types:
Node types: ['user', 'business']
Edge types: [('user', 'user_user', 'user'), ('user', 'user_business', 'business')]


In [5]:
# ---------------------------------------------------------------------------------------
# Cell 5: Define a 2-layer GNN Model (HeteroConv + SAGEConv)
# ---------------------------------------------------------------------------------------
class UserFraudGNN(torch.nn.Module):
    def __init__(self, hidden_dim, out_dim=2):
        super().__init__()
        # We'll define two "rounds" of HeteroConv
        # Each relation gets a SAGEConv (or any other PyG conv you like)
        self.conv1 = HeteroConv({
            ('user', 'user_user', 'user'): SAGEConv((-1, -1), hidden_dim),
            ('user', 'user_business', 'business'): SAGEConv((-1, -1), hidden_dim),
        }, aggr='mean')

        self.conv2 = HeteroConv({
            ('user', 'user_user', 'user'): SAGEConv((-1, -1), hidden_dim),
            ('user', 'user_business', 'business'): SAGEConv((-1, -1), hidden_dim),
        }, aggr='mean')

        # We'll read out from 'user' embeddings for classification
        self.user_lin = Linear(hidden_dim, out_dim)

    def forward(self, x_dict, edge_index_dict):
        # 1st round of message passing
        x_dict = self.conv1(x_dict, edge_index_dict)
        # apply relu
        for node_type, x in x_dict.items():
            x_dict[node_type] = F.relu(x)

        # 2nd round
        x_dict = self.conv2(x_dict, edge_index_dict)
        for node_type, x in x_dict.items():
            x_dict[node_type] = F.relu(x)

        # read out user embeddings => shape [num_users, hidden_dim]
        user_emb = x_dict['user']
        # final linear layer => shape [num_users, 2] for classification
        user_logits = self.user_lin(user_emb)
        return user_logits

# Instantiate the model, move to device
model = UserFraudGNN(HIDDEN_DIM, out_dim=2).to(device)
print(model)


UserFraudGNN(
  (conv1): HeteroConv(num_relations=2)
  (conv2): HeteroConv(num_relations=2)
  (user_lin): Linear(64, 2, bias=True)
)


In [6]:
# ---------------------------------------------------------------------------------------
# Cell 6: Training Setup (Optimizer, Loss, train/eval functions)
# ---------------------------------------------------------------------------------------
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

def train_step():
    """One step of training on the entire graph (masked to train users)."""
    model.train()
    optimizer.zero_grad()

    # forward pass: returns [num_users, 2] logits
    user_logits = model(data.x_dict, data.edge_index_dict)

    # gather train mask & labels
    train_mask = data['user'].train_mask
    masked_logits = user_logits[train_mask]
    masked_labels = data['user'].y[train_mask]

    # cross-entropy
    loss = F.cross_entropy(masked_logits, masked_labels)
    loss.backward()
    optimizer.step()
    return float(loss)

@torch.no_grad()
def evaluate(mask_name='val_mask'):
    """Compute accuracy on either val or test mask (mask_name: 'val_mask' or 'test_mask')."""
    model.eval()
    user_logits = model(data.x_dict, data.edge_index_dict)

    mask = data['user'][mask_name]
    logits_masked = user_logits[mask]
    labels_masked = data['user'].y[mask]

    preds = logits_masked.argmax(dim=-1)
    correct = (preds == labels_masked).sum()
    acc = correct / mask.sum()
    return float(acc)


In [7]:
# ---------------------------------------------------------------------------------------
# Cell 7: Main Training Loop
# ---------------------------------------------------------------------------------------
steps_per_epoch = STEPS_PER_EPOCH

for epoch in range(1, EPOCHS+1):
    avg_loss = 0.0
    for step in range(steps_per_epoch):
        loss_val = train_step()
        avg_loss += loss_val

    avg_loss /= steps_per_epoch
    val_acc = evaluate('val_mask')

    if epoch % PRINT_EVERY == 0:
        print(f"Epoch {epoch}/{EPOCHS} => loss={avg_loss:.4f}, val_acc={val_acc:.4f}")

test_acc = evaluate('test_mask')
print(f"\nFinal Test Accuracy: {test_acc:.4f}")


Epoch 1/5 => loss=0.6558, val_acc=0.6780
Epoch 2/5 => loss=0.6392, val_acc=0.6780
Epoch 3/5 => loss=0.6310, val_acc=0.6780
Epoch 4/5 => loss=0.6252, val_acc=0.6780
Epoch 5/5 => loss=0.6209, val_acc=0.6786

Final Test Accuracy: 0.6749


In [8]:
# ---------------------------------------------------------------------------------------
# Cell 8: Wrap-Up
# ---------------------------------------------------------------------------------------
print("""
Done!
We've replicated a 2-layer GNN approach in PyTorch Geometric.
- Built a HeteroData graph with user & business node sets, user->user & user->business edges.
- Trained a user-fraud classifier, applying train/val/test masks to user nodes.
- Printed final test accuracy.
""")



Done!
We've replicated a 2-layer GNN approach in PyTorch Geometric.
- Built a HeteroData graph with user & business node sets, user->user & user->business edges.
- Trained a user-fraud classifier, applying train/val/test masks to user nodes.
- Printed final test accuracy.

