In [23]:
# Jupyter Notebook: 01-GNN-DataPrep-2.ipynb
# =========================================
# This notebook prepares VeriShield synthetic data (including IP nodes) for GNN usage.
# By: (Harshil Bhandari / 01-19-2025) - Updated for IP-based expansions

# =====================================================================
# Cell 1: Imports & Global Settings
# =====================================================================
import os
import sys
import json
import numpy as np
import pandas as pd

# For saving PyTorch structures, if desired (optional).
import torch

# If you want to create a PyG 'HeteroData' object, import relevant PyG classes.
# from torch_geometric.data import HeteroData
# import torch_geometric

# Pandas display preferences: never truncate columns, cap printed rows at 50.
for _display_option, _display_value in (
    ('display.max_columns', None),
    ('display.max_rows', 50),
):
    pd.set_option(_display_option, _display_value)

# Banner so the cell output identifies which notebook produced it.
print("Notebook: 01-GNN-DataPrep-2 (Updated). Preparing synthetic data (Users, Businesses, IPs) for GNN modeling.")

Notebook: 01-GNN-DataPrep-2 (Updated). Preparing synthetic data (Users, Businesses, IPs) for GNN modeling.


In [24]:
# =====================================================================
# Cell 2: Configuration
# =====================================================================
# Choose the scenario subfolder
SCENARIO = "medium_fraud"

# Root directory that holds one subfolder of generated CSVs per scenario,
# i.e. <DATA_BASE_PATH>/<SCENARIO>/.
# Set the VERISHIELD_DATA_BASE_PATH environment variable to point at your own
# data directory. The original author's location is kept as the fallback so
# existing setups keep working without any change.
DATA_BASE_PATH = os.environ.get(
    "VERISHIELD_DATA_BASE_PATH",
    "/Users/harshil/Development/GitHub_Repos/"
    "VeriShield-AI-Financial-Verification-Platform/"
    "verishield_ml_experiments/data_generators/data",
)

# We'll store processed .npy files in a subfolder
OUTPUT_SUBFOLDER = "processed_gnn"

SINGLE_TASK_USER_ONLY = False  # If false, we'll do multi-task (users & biz)
DO_SPLIT = True               # Create train/val/test splits
TRAIN_RATIO = 0.70
VAL_RATIO = 0.15
TEST_RATIO = 0.15

# The split cell sizes train/val from their ratios and gives the remainder to
# test, so TEST_RATIO is only honest (e.g. in metadata.json) when the three
# ratios add up to 1. Fail early instead of recording a misleading value.
if DO_SPLIT and abs((TRAIN_RATIO + VAL_RATIO + TEST_RATIO) - 1.0) > 1e-9:
    raise ValueError(
        "TRAIN_RATIO + VAL_RATIO + TEST_RATIO must sum to 1.0, got "
        f"{TRAIN_RATIO + VAL_RATIO + TEST_RATIO}"
    )

scenario_path = os.path.join(DATA_BASE_PATH, SCENARIO)
processed_dir = os.path.join(scenario_path, OUTPUT_SUBFOLDER)
# Creates the output folder (and any missing parents) up front.
os.makedirs(processed_dir, exist_ok=True)

print(f"Scenario path: {scenario_path}")
print(f"Processed output will go to: {processed_dir}")

Scenario path: /Users/harshil/Development/GitHub_Repos/VeriShield-AI-Financial-Verification-Platform/verishield_ml_experiments/data_generators/data/medium_fraud
Processed output will go to: /Users/harshil/Development/GitHub_Repos/VeriShield-AI-Financial-Verification-Platform/verishield_ml_experiments/data_generators/data/medium_fraud/processed_gnn


In [25]:
# =====================================================================
# Cell 3: Load CSVs (Now includes IP nodes + user_ip rel)
# =====================================================================
def _read_scenario_csv(file_name):
    """Read one CSV file from the current scenario folder into a DataFrame."""
    return pd.read_csv(os.path.join(scenario_path, file_name))


# Load the three node tables and the three relationship tables. A missing file
# is reported and then aborts the run via sys.exit.
try:
    df_users = _read_scenario_csv("synthetic_users.csv")
    df_businesses = _read_scenario_csv("synthetic_businesses.csv")
    df_ip_nodes = _read_scenario_csv("ip_nodes.csv")
    df_user_ip = _read_scenario_csv("user_ip_relationships.csv")
    df_user_biz = _read_scenario_csv("user_business_relationships.csv")
    df_user_user = _read_scenario_csv("user_user_relationships.csv")
    print("DataFrames loaded successfully.")
except FileNotFoundError as missing_file_err:
    print(f"Error: Could not load one of the CSVs. {missing_file_err}")
    sys.exit("Check your scenario folder or file paths.")

# Quick (rows, columns) report for every table that was loaded.
print(f"\nShapes:")
for _shape_label, _loaded_frame in (
    ("  Users:         ", df_users),
    ("  Businesses:    ", df_businesses),
    ("  IP Nodes:      ", df_ip_nodes),
    ("  User->IP:      ", df_user_ip),
    ("  User->Biz:     ", df_user_biz),
    ("  User->User:    ", df_user_user),
):
    print(f"{_shape_label}{_loaded_frame.shape}")

DataFrames loaded successfully.

Shapes:
  Users:         (100000, 18)
  Businesses:    (10000, 6)
  IP Nodes:      (5000, 2)
  User->IP:      (100000, 2)
  User->Biz:     (221613, 2)
  User->User:    (5137, 2)


In [26]:
# =====================================================================
# Cell 4: Basic Validation & Checks (Now includes IP)
# =====================================================================
expected_users = len(df_users)
expected_biz = len(df_businesses)
expected_ips = len(df_ip_nodes)


def _out_of_range(id_series, max_valid_id):
    """Return a boolean mask marking IDs that fall outside 1..max_valid_id."""
    return (id_series < 1) | (id_series > max_valid_id)


# If IDs are numbered 1..N, the largest ID equals the row count. A mismatch is
# only reported here; nothing is dropped or corrected for the node tables.
max_user_id = df_users["user_id"].max()
if max_user_id != expected_users:
    print(f"Warning: max user_id={max_user_id}, but we have {expected_users} user rows. "
          f"Possible different indexing from the generator?")

max_biz_id = df_businesses["business_id"].max()
if max_biz_id != expected_biz:
    print(f"Warning: max business_id={max_biz_id}, but we have {expected_biz} biz rows.")

max_ip_id = df_ip_nodes["ip_id"].max()
if max_ip_id != expected_ips:
    print(f"Warning: max ip_id={max_ip_id}, but we have {expected_ips} IP rows.")

# --- Validate user_user edges (either endpoint out of range => drop the edge)
bad_uu_mask = (
    _out_of_range(df_user_user['from_user_id'], expected_users)
    | _out_of_range(df_user_user['to_user_id'], expected_users)
)
bad_uids_uu = df_user_user[bad_uu_mask]
if len(bad_uids_uu) > 0:
    print(f"Found {len(bad_uids_uu)} out-of-range user_user edges. Dropping them.")
    df_user_user = df_user_user.loc[~bad_uu_mask].copy()

# --- Validate user_biz edges
# Both masks are computed on the unfiltered frame and the union is removed in
# a single step. Dropping the two index sets one after the other raises
# KeyError when a row is invalid on BOTH columns (it is already gone by the
# time the second drop runs).
bad_user_mask_ub = _out_of_range(df_user_biz['user_id'], expected_users)
bad_biz_mask_ub = _out_of_range(df_user_biz['business_id'], expected_biz)
bad_uids_ub = df_user_biz[bad_user_mask_ub]
bad_bids_ub = df_user_biz[bad_biz_mask_ub]
if len(bad_uids_ub) > 0:
    print(f"Found {len(bad_uids_ub)} out-of-range user IDs in user_biz. Dropping them.")
if len(bad_bids_ub) > 0:
    print(f"Found {len(bad_bids_ub)} out-of-range biz IDs in user_biz. Dropping them.")
bad_any_ub = bad_user_mask_ub | bad_biz_mask_ub
if bad_any_ub.any():
    # .copy() so later column assignments act on an independent frame.
    df_user_biz = df_user_biz.loc[~bad_any_ub].copy()

# --- Validate user_ip edges (same single-pass union drop as user_biz)
bad_user_mask_ui = _out_of_range(df_user_ip['user_id'], expected_users)
bad_ip_mask_ui = _out_of_range(df_user_ip['ip_id'], expected_ips)
bad_uids_ui = df_user_ip[bad_user_mask_ui]
bad_ipids_ui = df_user_ip[bad_ip_mask_ui]
if len(bad_uids_ui) > 0:
    print(f"Found {len(bad_uids_ui)} out-of-range user IDs in user_ip. Dropping them.")
if len(bad_ipids_ui) > 0:
    print(f"Found {len(bad_ipids_ui)} out-of-range ip IDs in user_ip. Dropping them.")
bad_any_ui = bad_user_mask_ui | bad_ip_mask_ui
if bad_any_ui.any():
    df_user_ip = df_user_ip.loc[~bad_any_ui].copy()

print("\nBasic ID validation done.")


Basic ID validation done.


In [27]:
# =====================================================================
# Cell 5: Feature Engineering
# =====================================================================
def encode_segment(seg):
    """Translate a user segment name into its integer code.

    "casual" -> 0, "smb_owner" -> 1, "enterprise" -> 2, "money_mule" -> 3.
    Any other value (unknown name, None, NaN) falls back to 0, i.e. the same
    code as "casual".
    """
    segment_codes = {"casual": 0, "smb_owner": 1, "enterprise": 2, "money_mule": 3}
    if seg in segment_codes:
        return segment_codes[seg]
    return 0

def watchlist_country(ctry):
    """Return 1 when ``ctry`` is one of the watch-listed country codes, else 0.

    The match is case-insensitive. Anything that is not a string (None, NaN,
    numbers) yields 0.
    """
    # NOTE(review): "NK" is not the ISO 3166-1 alpha-2 code for North Korea
    # ("KP"); kept as-is — confirm which codes the data generator emits.
    if not isinstance(ctry, str):
        return 0
    flagged_codes = ("NK", "IR", "SY", "CU", "AF", "SO", "LY")
    return int(ctry.upper() in flagged_codes)

def suspicious_name(bname):
    """Return 1 if the business name contains a suspicious keyword, else 0.

    The name is stringified and lower-cased first, and keywords are matched as
    plain substrings (so "Contest Ltd" matches "test").
    """
    lowered_name = str(bname).lower()
    for keyword in ("test", "fake", "shell", "phantom", "bogus", "shady"):
        if keyword in lowered_name:
            return 1
    return 0

# -- Feature eng for df_users
df_users['segment_code'] = df_users['segment'].apply(encode_segment).fillna(0).astype(int)

# Boolean flags -> 0/1 integers.
# A column that is absent from the CSV becomes an all-zero column, and missing
# values count as 0. (DataFrame.get(col, False) would hand back a plain bool
# for an absent column, and a plain bool has no .astype.)
for _flag_col in ('burst_signup', 'is_ring_leader'):
    if _flag_col in df_users.columns:
        # Go through float so NaN can be filled before the integer cast.
        df_users[_flag_col] = df_users[_flag_col].astype(float).fillna(0).astype(int)
    else:
        df_users[_flag_col] = 0

# For log transform, handle negative or missing ip_count:
# missing -> 0, negatives clipped to 0, then log1p to compress the range.
df_users['ip_count'] = df_users['ip_count'].fillna(0).clip(lower=0)
df_users['ip_count_log'] = np.log1p(df_users['ip_count'])

# 1 when the user's country code is on the watch list, 0 otherwise.
df_users['country_watch'] = df_users['country_code'].fillna('').apply(watchlist_country)

def phone_suspicious(phone):
    """Return 1 if a phone number looks suspicious, else 0.

    A number is flagged when its string form is shorter than 7 characters
    (which includes the empty string) or when it contains '+999' or '666-666'.
    """
    phone_text = str(phone)
    too_short = len(phone_text) < 7
    has_flagged_pattern = any(marker in phone_text for marker in ('+999', '666-666'))
    return 1 if (too_short or has_flagged_pattern) else 0

def email_suspicious(email):
    """Return 1 if the address uses one of the flagged mail domains, else 0.

    The address is stringified and lower-cased before the substring check.
    """
    normalized_email = str(email).lower()
    for flagged_domain in ("@tempmail.xyz", "@fakemail.com", "@guerrillamail.com"):
        if flagged_domain in normalized_email:
            return 1
    return 0

# Contact-detail red flags for users. Missing values are scored as empty
# strings (an empty phone is shorter than 7 characters, so it is flagged).
for _flag_name, _source_col, _scorer in (
    ('phone_susp', 'phone', phone_suspicious),
    ('email_susp', 'email', email_suspicious),
):
    df_users[_flag_name] = df_users[_source_col].fillna('').apply(_scorer)

# -- Feature eng for df_businesses
registration_countries = df_businesses['registration_country'].fillna('')
df_businesses['watchlist_regctry'] = registration_countries.apply(watchlist_country)
df_businesses['susp_name_flag'] = df_businesses['business_name'].apply(suspicious_name)

def days_since_incorp(date, ref_date=None):
    """Whole days elapsed between an incorporation date and a reference date.

    Args:
        date: Incorporation date in any form accepted by ``pd.to_datetime``.
        ref_date: Date to measure against. When omitted, the current time
            (``pd.Timestamp.now()``) is used, which makes the result depend on
            when the notebook is run; pass a fixed date for reproducible
            features.

    Returns:
        A non-negative number of days. 0 is returned when ``date`` is missing,
        when it parses to NaT (e.g. an empty string), or when it lies after
        the reference date.

    Raises:
        Whatever ``pd.to_datetime`` raises for a value it cannot parse.
    """
    if pd.isnull(date):
        return 0
    incorporated_on = pd.to_datetime(date)
    # Some inputs pass the null check above but still parse to NaT; without
    # this guard the day count would be NaN and max(NaN, 0) would return NaN.
    if pd.isnull(incorporated_on):
        return 0
    reference = pd.Timestamp.now() if ref_date is None else pd.to_datetime(ref_date)
    elapsed = reference - incorporated_on
    return max(elapsed.days, 0)

# Business age in days, plus a log1p-compressed copy of it.
biz_age_days = df_businesses['incorporation_date'].apply(days_since_incorp)
df_businesses['biz_age_days'] = biz_age_days
df_businesses['biz_age_log'] = np.log1p(biz_age_days.fillna(0))

# Report the resulting column sets for a quick visual check.
print("\nFeature engineering done.")
print("User columns now include:", list(df_users.columns))
print("Business columns now include:", list(df_businesses.columns))


Feature engineering done.
User columns now include: ['user_id', 'segment', 'name', 'email', 'username', 'birthdate', 'gender', 'wave_fraud_boost', 'device_id', 'phone', 'country_code', 'created_at', 'burst_signup', 'fraud_label', 'is_ring_leader', 'email_domain', 'ip_count', 'num_fraud_biz_owned', 'segment_code', 'ip_count_log', 'country_watch', 'phone_susp', 'email_susp']
Business columns now include: ['business_id', 'business_name', 'registration_country', 'incorporation_date', 'owner_name', 'fraud_label', 'watchlist_regctry', 'susp_name_flag', 'biz_age_days', 'biz_age_log']


In [28]:
# =====================================================================
# Cell 6: Node ID Assignment & Edge Building (3 node types: user, biz, ip)
# =====================================================================
num_users = len(df_users)
num_biz   = len(df_businesses)
num_ips   = len(df_ip_nodes)

# 1-based generator IDs => 0-based node indices.
# NOTE(review): this only yields valid, gap-free node indices when each ID
# column is exactly 1..N with no duplicates; the validation cell merely prints
# a warning when max(id) differs from the row count — confirm for new data.
df_users['node_id'] = df_users['user_id'] - 1
df_businesses['node_id'] = df_businesses['business_id'] - 1
df_ip_nodes['node_id'] = df_ip_nodes['ip_id'] - 1  # if we want IP node indices

# -- Build user->user edges
df_user_user['from_id_0'] = df_user_user['from_user_id'] - 1
df_user_user['to_id_0']   = df_user_user['to_user_id']   - 1

# Walk the two columns directly instead of DataFrame.iterrows(), which builds
# a Series per row and is very slow on large edge tables. Every relationship
# is stored in both directions (undirected), forward edge first.
edges_user_user = []
for f_id, t_id in zip(df_user_user['from_id_0'].tolist(),
                      df_user_user['to_id_0'].tolist()):
    edges_user_user.append((f_id, t_id))
    edges_user_user.append((t_id, f_id))  # undirected

# -- Build user->biz edges (one direction only: user -> business)
df_user_biz['user_id_0'] = df_user_biz['user_id'] - 1
df_user_biz['biz_id_0']  = df_user_biz['business_id'] - 1

edges_user_biz = list(zip(df_user_biz['user_id_0'].tolist(),
                          df_user_biz['biz_id_0'].tolist()))

# -- Build user->ip edges (one direction only: user -> ip)
df_user_ip['user_id_0'] = df_user_ip['user_id'] - 1
df_user_ip['ip_id_0']   = df_user_ip['ip_id']   - 1

edges_user_ip = list(zip(df_user_ip['user_id_0'].tolist(),
                         df_user_ip['ip_id_0'].tolist()))

print(f"\nConstructed {len(edges_user_user)} user-user edges (incl duplicates).")
print(f"Constructed {len(edges_user_biz)} user-business edges.")
print(f"Constructed {len(edges_user_ip)} user-ip edges.")


Constructed 10274 user-user edges (incl duplicates).
Constructed 221613 user-business edges.
Constructed 100000 user-ip edges.


In [29]:
# =====================================================================
# Cell 7: Creating Feature Arrays & Labels
# =====================================================================
# ----- User nodes -----
# Rows are ordered by node_id so that row i of each array belongs to node i.
user_feature_cols = [
    'segment_code',
    'is_ring_leader',
    'ip_count_log',
    'phone_susp',
    'email_susp',
    'country_watch',
    'burst_signup',
]
df_users_sorted = df_users.sort_values(by='node_id')
user_features = df_users_sorted.loc[:, user_feature_cols].to_numpy(dtype=np.float32)
# Missing fraud labels are treated as 0.
user_labels = df_users_sorted['fraud_label'].fillna(0).astype(int).to_numpy()

# ----- Business nodes -----
biz_feature_cols = ['watchlist_regctry', 'susp_name_flag', 'biz_age_log']
df_biz_sorted = df_businesses.sort_values(by='node_id')
biz_features = df_biz_sorted.loc[:, biz_feature_cols].to_numpy(dtype=np.float32)
biz_labels = df_biz_sorted['fraud_label'].fillna(0).astype(int).to_numpy()

# ----- IP nodes (optional, for a multi-node-type GNN) -----
# There is no fraud label for IPs in the data, so the single feature
# 'susp_ip_flag' is a constant-0 placeholder and the labels are all zeros.
ip_feature_cols = ['susp_ip_flag']
df_ip_sorted = df_ip_nodes.sort_values(by='node_id').assign(susp_ip_flag=0)
ip_features = df_ip_sorted.loc[:, ip_feature_cols].to_numpy(dtype=np.float32)
ip_labels = np.zeros(len(df_ip_sorted), dtype=int)

print(f"User feature shape: {user_features.shape}, Business feature shape: {biz_features.shape}, IP feature shape: {ip_features.shape}")

User feature shape: (100000, 7), Business feature shape: (10000, 3), IP feature shape: (5000, 1)


In [30]:
# =====================================================================
# Cell 8: Train/Val/Test Splits
# =====================================================================
# Fixed seed so that train/val/test membership is identical on every run.
SPLIT_SEED = 42


def _make_split_masks(node_ids, num_nodes, train_ratio, val_ratio, rng):
    """Randomly partition node IDs into train/val/test boolean masks.

    Args:
        node_ids: 0-based node indices to distribute over the three splits.
        num_nodes: Length of each mask; also the base for the split sizes.
        train_ratio: Fraction of ``num_nodes`` assigned to train.
        val_ratio: Fraction of ``num_nodes`` assigned to validation.
        rng: ``numpy.random.Generator`` that supplies the shuffle.

    Returns:
        ``(train_mask, val_mask, test_mask)``: boolean arrays of length
        ``num_nodes``. Test receives every ID left over after train and val.
    """
    # permutation() returns a shuffled COPY. An in-place shuffle of the array
    # handed out by Series.to_numpy() may scramble the DataFrame column that
    # backs it, or fail when that array is read-only.
    shuffled_ids = rng.permutation(np.asarray(node_ids))
    n_train = int(train_ratio * num_nodes)
    n_val = int(val_ratio * num_nodes)
    id_groups = (
        shuffled_ids[:n_train],
        shuffled_ids[n_train:n_train + n_val],
        shuffled_ids[n_train + n_val:],
    )
    masks = []
    for group_ids in id_groups:
        mask = np.zeros(num_nodes, dtype=bool)
        mask[group_ids] = True
        masks.append(mask)
    return tuple(masks)


# Masks stay None when DO_SPLIT is off (or, for businesses, in single-task mode).
train_mask_users = None
val_mask_users   = None
test_mask_users  = None

train_mask_biz = None
val_mask_biz   = None
test_mask_biz  = None

train_mask_ip = None
val_mask_ip   = None
test_mask_ip  = None

if DO_SPLIT:
    # One generator for all node types; draws happen in a fixed order
    # (users, businesses, IPs), so the result is reproducible.
    split_rng = np.random.default_rng(SPLIT_SEED)

    # -- Users
    train_mask_users, val_mask_users, test_mask_users = _make_split_masks(
        df_users_sorted['node_id'].to_numpy(), num_users, TRAIN_RATIO, VAL_RATIO, split_rng
    )
    print(f"\nUser node splits => train={train_mask_users.sum()}, val={val_mask_users.sum()}, test={test_mask_users.sum()}")

    # -- Businesses (multi-task only if SINGLE_TASK_USER_ONLY=False)
    if not SINGLE_TASK_USER_ONLY:
        train_mask_biz, val_mask_biz, test_mask_biz = _make_split_masks(
            df_biz_sorted['node_id'].to_numpy(), num_biz, TRAIN_RATIO, VAL_RATIO, split_rng
        )
        print(f"Business node splits => train={train_mask_biz.sum()}, val={val_mask_biz.sum()}, test={test_mask_biz.sum()}")

    # -- IP (Optional)
    # Only meaningful if IPs get a real label; the masks are produced anyway.
    train_mask_ip, val_mask_ip, test_mask_ip = _make_split_masks(
        df_ip_sorted['node_id'].to_numpy(), num_ips, TRAIN_RATIO, VAL_RATIO, split_rng
    )
    print(f"IP node splits => train={train_mask_ip.sum()}, val={val_mask_ip.sum()}, test={test_mask_ip.sum()}")


User node splits => train=70000, val=15000, test=15000
Business node splits => train=7000, val=1500, test=1500
IP node splits => train=3500, val=750, test=750


In [31]:
# =====================================================================
# Cell 9: Saving Processed Data
# =====================================================================
def _save_array(file_name, array):
    """Write ``array`` as ``file_name`` inside the processed-output folder."""
    np.save(os.path.join(processed_dir, file_name), array)


def _edge_array(edge_pairs):
    """Convert a list of (src, dst) pairs into an int64 array of shape (2, E).

    The reshape keeps the (2, E) layout for an EMPTY list as well. Without it
    an empty list becomes a 1-D array of shape (0,), and reading ``shape[1]``
    for the metadata raises IndexError.
    """
    return np.array(edge_pairs, dtype=np.int64).reshape(-1, 2).T


# 1) Node features + labels
_save_array("user_features.npy", user_features)
_save_array("user_labels.npy",   user_labels)
_save_array("biz_features.npy",  biz_features)
_save_array("biz_labels.npy",    biz_labels)
# IP features + labels (if you're doing multi-node-type GNN):
_save_array("ip_features.npy",   ip_features)
_save_array("ip_labels.npy",     ip_labels)  # zero by default

# 2) Edges
user_user_arr = _edge_array(edges_user_user)  # shape (2, E_uu)
user_biz_arr  = _edge_array(edges_user_biz)   # shape (2, E_ub)
user_ip_arr   = _edge_array(edges_user_ip)    # shape (2, E_ui)

_save_array("edge_user_user.npy", user_user_arr)
_save_array("edge_user_biz.npy",  user_biz_arr)
_save_array("edge_user_ip.npy",   user_ip_arr)

# 3) Masks (only written when a split was requested)
if DO_SPLIT:
    # user masks
    _save_array("train_mask_users.npy", train_mask_users)
    _save_array("val_mask_users.npy",   val_mask_users)
    _save_array("test_mask_users.npy",  test_mask_users)

    # business masks
    if not SINGLE_TASK_USER_ONLY and train_mask_biz is not None:
        _save_array("train_mask_biz.npy", train_mask_biz)
        _save_array("val_mask_biz.npy",   val_mask_biz)
        _save_array("test_mask_biz.npy",  test_mask_biz)

    # ip masks
    # only relevant if you want to do IP classification
    if train_mask_ip is not None:
        _save_array("train_mask_ip.npy", train_mask_ip)
        _save_array("val_mask_ip.npy",   val_mask_ip)
        _save_array("test_mask_ip.npy",  test_mask_ip)

# 4) Metadata JSON: records the configuration and sizes next to the arrays.
metadata = {
    "scenario": SCENARIO,
    "num_users": num_users,
    "num_businesses": num_biz,
    "num_ips": num_ips,
    "user_feature_cols": user_feature_cols,
    "biz_feature_cols": biz_feature_cols,
    "ip_feature_cols": ip_feature_cols,  # if you want to track them
    "do_split": DO_SPLIT,
    "train_ratio": TRAIN_RATIO,
    "val_ratio": VAL_RATIO,
    "test_ratio": TEST_RATIO,
    "SINGLE_TASK_USER_ONLY": SINGLE_TASK_USER_ONLY,
    "edges_user_user_count": user_user_arr.shape[1],
    "edges_user_biz_count":  user_biz_arr.shape[1],
    "edges_user_ip_count":   user_ip_arr.shape[1],
}

with open(os.path.join(processed_dir, "metadata.json"), "w") as f:
    json.dump(metadata, f, indent=2)

print("\nAll processed data saved to:", processed_dir)
print("Data prep complete (with IP nodes & user->ip relationships)!")



All processed data saved to: /Users/harshil/Development/GitHub_Repos/VeriShield-AI-Financial-Verification-Platform/verishield_ml_experiments/data_generators/data/medium_fraud/processed_gnn
Data prep complete (with IP nodes & user->ip relationships)!
