In [1]:
# Jupyter Notebook: 01-GNN-DataPrep.ipynb
# =========================================
# This notebook prepares VeriShield synthetic data for GNN usage.
# By: (Harshil Bhandari / 01-18-2025)

# =====================================================================
# Cell 1: Imports & Global Settings
# =====================================================================
import os
import sys
import json
import numpy as np
import pandas as pd

# For saving PyTorch structures, if desired (optional).
import torch

# If you want to create a PyG 'HeteroData' object, import relevant PyG classes.
# import torch_geometric
# from torch_geometric.data import HeteroData

# Display settings
# Pandas display options: show every column, cap printed rows at 50.
for _opt_name, _opt_value in (('display.max_columns', None), ('display.max_rows', 50)):
    pd.set_option(_opt_name, _opt_value)

# Banner so the run log shows which notebook produced the output.
print("Notebook: 01-GNN-DataPrep. Preparing synthetic data for GNN modeling.")

Notebook: 01-GNN-DataPrep. Preparing synthetic data for GNN modeling.


In [2]:
# =====================================================================
# Cell 2: Configuration
# =====================================================================
SCENARIO = "high_fraud"  # "low_fraud", "default", etc.

# Root folder holding one sub-folder per scenario. Set the VERISHIELD_DATA_PATH
# environment variable to run on another machine; the fallback is the original
# development location so existing runs behave exactly as before.
DATA_BASE_PATH = os.environ.get(
    "VERISHIELD_DATA_PATH",
    "/Users/harshil/Development/personal_projects/VeriShield-ML-Experiments/data_generators/data",
)
OUTPUT_SUBFOLDER = "processed_gnn"

# If you want to do a user node classification, set True. If multi-task, handle business as well.
SINGLE_TASK_USER_ONLY = True

# If you want to split nodes for train/val/test at the user level:
DO_SPLIT = True

# Example: 70/15/15 split
TRAIN_RATIO = 0.70
VAL_RATIO = 0.15
TEST_RATIO = 0.15

# The three ratios are written to metadata.json as a description of the split,
# so fail early if they do not describe a full partition.
_ratio_total = TRAIN_RATIO + VAL_RATIO + TEST_RATIO
if abs(_ratio_total - 1.0) > 1e-9:
    raise ValueError(
        f"TRAIN_RATIO + VAL_RATIO + TEST_RATIO must sum to 1.0, got {_ratio_total}"
    )

# Paths
scenario_path = os.path.join(DATA_BASE_PATH, SCENARIO)
processed_dir = os.path.join(scenario_path, OUTPUT_SUBFOLDER)
os.makedirs(processed_dir, exist_ok=True)

print(f"Scenario path: {scenario_path}")
print(f"Processed output will go to: {processed_dir}")

Scenario path: /Users/harshil/Development/personal_projects/VeriShield-ML-Experiments/data_generators/data/high_fraud
Processed output will go to: /Users/harshil/Development/personal_projects/VeriShield-ML-Experiments/data_generators/data/high_fraud/processed_gnn


In [3]:
# =====================================================================
# Cell 3: Load CSVs
# =====================================================================
def _read_scenario_csv(filename):
    """Read one CSV located directly under `scenario_path`."""
    return pd.read_csv(os.path.join(scenario_path, filename))


# Load the four input tables; a missing file aborts the cell with a hint.
try:
    df_users = _read_scenario_csv("synthetic_users.csv")
    df_businesses = _read_scenario_csv("synthetic_businesses.csv")
    df_user_biz = _read_scenario_csv("user_business_relationships.csv")
    df_user_user = _read_scenario_csv("user_user_relationships.csv")
    print("DataFrames loaded successfully.")
except FileNotFoundError as missing:
    print(f"Error: Could not load one of the main CSVs. {missing}")
    sys.exit("Check your scenario folder or file paths.")

# Quick size summary of each table.
print(f"\nShapes:")
for _label, _frame in (
    ("Users", df_users),
    ("Businesses", df_businesses),
    ("User-Biz", df_user_biz),
    ("User-User", df_user_user),
):
    print(f"  {_label}: {_frame.shape}")

DataFrames loaded successfully.

Shapes:
  Users: (100000, 18)
  Businesses: (10000, 6)
  User-Biz: (220942, 2)
  User-User: (5029, 2)


In [4]:
# =====================================================================
# Cell 4: Basic Validation & Checks
# =====================================================================

# 4.1 Check ID ranges
# Later cells derive node ids as (id - 1), so ids are expected to run 1..N.
max_user_id = df_users["user_id"].max()
expected_users = len(df_users)
if max_user_id != expected_users:
    print(f"Warning: The max user_id is {max_user_id}, but we have {expected_users} rows. Possibly okay if data generator used a different approach.")

# Similarly for businesses
max_biz_id = df_businesses["business_id"].max()
expected_biz = len(df_businesses)
if max_biz_id != expected_biz:
    print(f"Warning: The max business_id is {max_biz_id}, but we have {expected_biz} rows. Possibly okay if data generator used a different approach.")

# 4.2 Check user_user edges: both endpoints must lie in [1, expected_users].
bad_uids_uu = df_user_user[ (df_user_user['from_user_id'] < 1) | (df_user_user['from_user_id'] > expected_users) |
                            (df_user_user['to_user_id'] < 1)   | (df_user_user['to_user_id'] > expected_users) ]
if len(bad_uids_uu) > 0:
    print(f"Found {len(bad_uids_uu)} out-of-range user_user edges. Dropping them.")
    df_user_user = df_user_user.drop(bad_uids_uu.index)

# 4.3 Check user_biz edges
ub_bad_user = (df_user_biz['user_id'] < 1) | (df_user_biz['user_id'] > expected_users)
ub_bad_biz = (df_user_biz['business_id'] < 1) | (df_user_biz['business_id'] > expected_biz)
bad_uids_ub = df_user_biz[ub_bad_user]
bad_bids_ub = df_user_biz[ub_bad_biz]
if len(bad_uids_ub) > 0:
    print(f"Found {len(bad_uids_ub)} out-of-range user IDs in user_biz. Dropping them.")
if len(bad_bids_ub) > 0:
    print(f"Found {len(bad_bids_ub)} out-of-range business IDs in user_biz. Dropping them.")

# Drop the union in a single pass. Dropping the two sets one after the other
# raises KeyError when a row is bad on both sides, because its index label is
# already gone by the time the second drop runs.
ub_bad_any = ub_bad_user | ub_bad_biz
if ub_bad_any.any():
    df_user_biz = df_user_biz.drop(index=df_user_biz.index[ub_bad_any])

print("\nBasic ID validation done.")


Basic ID validation done.


In [5]:
# =====================================================================
# Cell 5: Feature Engineering
# =====================================================================
# We'll define some helper functions to transform certain columns.

def encode_segment(seg):
    """Map a segment name to its integer code; unrecognised values map to 0."""
    segment_codes = {"casual": 0, "smb_owner": 1, "enterprise": 2, "money_mule": 3}
    if seg in segment_codes:
        return segment_codes[seg]
    return 0

def watchlist_country(ctry):
    """Flag a country code: 1 when it appears on the watchlist, otherwise 0."""
    flagged_codes = ("NK", "IR", "SY", "CU", "AF", "SO", "LY")
    return int(ctry in flagged_codes)

def suspicious_name(bname):
    """Return 1 if the lower-cased business name contains a suspicious keyword, else 0."""
    lowered = str(bname).lower()
    # Substring match, so e.g. "contest" also triggers on "test".
    for keyword in ("test", "fake", "shell", "phantom", "bogus", "shady"):
        if keyword in lowered:
            return 1
    return 0

# We can create numeric feature arrays for users & businesses.

# 5.1 Users
df_users['segment_code'] = df_users['segment'].apply(encode_segment)
df_users['burst_signup'] = df_users['burst_signup'].astype(int)  # boolean->0/1

# is_ring_leader is optional in the input. `df.get(col, False)` returns a plain
# bool when the column is absent, and a bool has no `.astype`, so the missing
# case is handled explicitly by defaulting every user to 0.
if 'is_ring_leader' in df_users.columns:
    df_users['is_ring_leader'] = df_users['is_ring_leader'].astype(int)  # boolean->0/1
else:
    df_users['is_ring_leader'] = 0

df_users['ip_count_log'] = np.log1p(df_users['ip_count'])  # optional log transform
df_users['country_watch'] = df_users['country_code'].apply(watchlist_country)

# Example: suspicious phone/email signals
def phone_suspicious(phone):
    """Return 1 for a phone value that is very short or contains a known bad pattern, else 0."""
    text = str(phone)
    # Fewer than 7 characters is treated as suspicious.
    too_short = len(text) < 7
    # Known placeholder patterns.
    has_bad_pattern = '+999' in text or '666-666' in text
    return 1 if (too_short or has_bad_pattern) else 0

def email_suspicious(email):
    """Return 1 if the lower-cased email contains a known throwaway domain, else 0."""
    lowered = str(email).lower()
    for throwaway in ("@tempmail.xyz", "@fakemail.com", "@guerrillamail.com"):
        if throwaway in lowered:
            return 1
    return 0

# Element-wise suspicious-contact flags.
df_users['phone_susp'] = df_users['phone'].map(phone_suspicious)
df_users['email_susp'] = df_users['email'].map(email_suspicious)

# Fill missing numeric columns with 0
num_cols_users = ['ip_count','ip_count_log','phone_susp','email_susp','country_watch']
for user_col in num_cols_users:
    filled = df_users[user_col].fillna(0)
    df_users[user_col] = filled

# 5.2 Businesses
# Binary flags derived from the registration country and the business name.
_reg_country = df_businesses['registration_country']
_biz_name = df_businesses['business_name']
df_businesses['watchlist_regctry'] = _reg_country.map(watchlist_country)
df_businesses['susp_name_flag'] = _biz_name.map(suspicious_name)

# For biz 'incorporation_date' we can do an 'age_in_days' from a reference:
def days_since_incorp(date, ref_date=None):
    """Return the whole number of days between `date` and a reference date.

    Parameters
    ----------
    date : anything accepted by `pd.to_datetime`, or a null value
        Incorporation date of the business.
    ref_date : anything accepted by `pd.to_datetime`, optional
        Date to measure the age against. Defaults to the current time
        (`pd.Timestamp.now()`), which is the original behaviour. Pass a fixed
        date to make the resulting feature reproducible across runs.

    Returns
    -------
    int
        `(ref_date - date).days`; 0 when `date` is null. The value is negative
        when `date` lies after the reference date.
    """
    if pd.isnull(date):
        return 0
    reference = pd.Timestamp.now() if ref_date is None else pd.to_datetime(ref_date)
    delta = reference - pd.to_datetime(date)
    return delta.days

df_businesses['biz_age_days'] = df_businesses['incorporation_date'].apply(days_since_incorp)

# Clip at 0 before the log: a date one day in the future gives log1p(-1) = -inf,
# which fillna(0) below would not remove. Earlier future dates give NaN, which
# was already filled with 0, so non-negative ages are unaffected.
df_businesses['biz_age_log'] = np.log1p(df_businesses['biz_age_days'].clip(lower=0))

# Fill missing numeric
num_cols_biz = ['watchlist_regctry','susp_name_flag','biz_age_days','biz_age_log']
for c in num_cols_biz:
    df_businesses[c] = df_businesses[c].fillna(0)

print("\nFeature engineering done. Example user columns now include:", df_users.columns.tolist())
print("Example business columns now include:", df_businesses.columns.tolist())


Feature engineering done. Example user columns now include: ['user_id', 'segment', 'name', 'email', 'username', 'birthdate', 'gender', 'signup_ip', 'device_id', 'phone', 'country_code', 'created_at', 'burst_signup', 'fraud_label', 'is_ring_leader', 'email_domain', 'ip_count', 'num_fraud_biz_owned', 'segment_code', 'ip_count_log', 'country_watch', 'phone_susp', 'email_susp']
Example business columns now include: ['business_id', 'business_name', 'registration_country', 'incorporation_date', 'owner_name', 'fraud_label', 'watchlist_regctry', 'susp_name_flag', 'biz_age_days', 'biz_age_log']


In [6]:
# =====================================================================
# Cell 6: Node ID Assignment & Edge Building
# =====================================================================
num_users = len(df_users)
num_biz = len(df_businesses)

# We'll do 0-based indexing for users, 0-based for businesses, but offset for business if we do a single graph
# If using a heterograph approach, keep them separate.

# 6.1 Create node_id columns
# NOTE(review): this assumes user_id / business_id are 1-based and contiguous,
# so that (id - 1) lands in [0..N-1] -- confirm against the data generator.
df_users['node_id'] = df_users['user_id'] - 1
df_businesses['node_id'] = df_businesses['business_id'] - 1

# 6.2 Build user-user edges
# For ring leaders
df_user_user['from_id_0'] = df_user_user['from_user_id'] - 1
df_user_user['to_id_0'] = df_user_user['to_user_id'] - 1

# Each relationship is emitted in both directions so the graph can be treated
# as undirected. Columns are zipped directly instead of using iterrows(), which
# builds a Series per row and is very slow on large frames. Edge order is kept:
# forward edge first, then its reverse.
edges_user_user = [
    edge
    for f_id, t_id in zip(df_user_user['from_id_0'].tolist(), df_user_user['to_id_0'].tolist())
    for edge in ((f_id, t_id), (t_id, f_id))
]

# 6.3 Build user-business edges
df_user_biz['user_id_0'] = df_user_biz['user_id'] - 1
df_user_biz['biz_id_0'] = df_user_biz['business_id'] - 1

# One (user, business) pair per relationship row.
edges_user_biz = list(zip(df_user_biz['user_id_0'].tolist(), df_user_biz['biz_id_0'].tolist()))

print(f"\nConstructed {len(edges_user_user)} user-user edges (including possible duplicates for undirected).")
print(f"Constructed {len(edges_user_biz)} user-business edges.")


Constructed 10058 user-user edges (including possible duplicates for undirected).
Constructed 220942 user-business edges.


In [7]:
# =====================================================================
# Cell 7: Creating Feature Arrays & Labels
# =====================================================================
def _ordered_arrays(frame, feature_cols):
    """Sort `frame` by node_id and return (sorted frame, float32 features, int labels)."""
    ordered = frame.sort_values('node_id')
    features = ordered[feature_cols].to_numpy(dtype=np.float32)
    labels = ordered['fraud_label'].astype(int).to_numpy()
    return ordered, features, labels


# 7.1 Users: numeric columns that make up each user's feature vector.
user_feature_cols = [
    'segment_code', 'is_ring_leader', 'ip_count_log', 'phone_susp',
    'email_susp', 'country_watch', 'burst_signup',
]
# Rows are ordered by node_id; labels come from fraud_label.
df_users_sorted, user_features, user_labels = _ordered_arrays(df_users, user_feature_cols)

# 7.2 Businesses: same treatment with the business feature columns.
biz_feature_cols = ['watchlist_regctry', 'susp_name_flag', 'biz_age_log']
df_biz_sorted, biz_features, biz_labels = _ordered_arrays(df_businesses, biz_feature_cols)

print(f"User feature shape: {user_features.shape}, Business feature shape: {biz_features.shape}")

User feature shape: (100000, 7), Business feature shape: (10000, 3)


In [8]:
# =====================================================================
# Cell 8: Optional Train/Val/Test Split for user nodes
# =====================================================================
train_mask_users = None
val_mask_users = None
test_mask_users = None

# Fixed seed so Restart & Run All reproduces the same split.
SPLIT_SEED = 42

if DO_SPLIT:
    # Random split of user node IDs.
    # Generator.permutation returns a shuffled copy. An in-place shuffle on the
    # array from to_numpy() may reorder the DataFrame's own data, since that
    # array can be a view (or read-only under pandas copy-on-write).
    split_rng = np.random.default_rng(SPLIT_SEED)
    user_node_ids = split_rng.permutation(df_users_sorted['node_id'].to_numpy())

    n_train = int(TRAIN_RATIO * num_users)
    n_val = int(VAL_RATIO * num_users)
    # n_test = remaining

    train_ids = user_node_ids[:n_train]
    val_ids = user_node_ids[n_train:n_train+n_val]
    test_ids = user_node_ids[n_train+n_val:]

    # Boolean masks of shape [num_users], indexed by node_id.
    train_mask_users = np.zeros(num_users, dtype=bool)
    val_mask_users = np.zeros(num_users, dtype=bool)
    test_mask_users = np.zeros(num_users, dtype=bool)

    train_mask_users[train_ids] = True
    val_mask_users[val_ids] = True
    test_mask_users[test_ids] = True

    print(f"\nUser node splits: train={train_mask_users.sum()}, val={val_mask_users.sum()}, test={test_mask_users.sum()}")

# If multi-task with business labels, we could do something similar for businesses.


User node splits: train=70000, val=15000, test=15000


In [9]:
# =====================================================================
# Cell 9: Saving Processed Data
# =====================================================================
# We'll store arrays in the processed_gnn folder. 
# You can store them as .npy, .pt, or .pkl, or build a HeteroData object if you prefer PyG.

# Let's do .npy for example:
# Node features and labels as .npy files.
np.save(os.path.join(processed_dir, "user_features.npy"), user_features)
np.save(os.path.join(processed_dir, "user_labels.npy"), user_labels)
np.save(os.path.join(processed_dir, "biz_features.npy"), biz_features)
np.save(os.path.join(processed_dir, "biz_labels.npy"), biz_labels)

# Edge lists
# Stored as arrays of shape [2, num_edges] for potential usage in PyTorch Geometric.
# reshape(-1, 2) keeps the array 2-D when an edge list is empty; without it an
# empty list becomes a 1-D array and `.shape[1]` below raises IndexError.
user_user_arr = np.array(edges_user_user, dtype=np.int64).reshape(-1, 2).T  # shape (2, E_uu)
user_biz_arr = np.array(edges_user_biz, dtype=np.int64).reshape(-1, 2).T    # shape (2, E_ub)

np.save(os.path.join(processed_dir, "edge_user_user.npy"), user_user_arr)
np.save(os.path.join(processed_dir, "edge_user_biz.npy"), user_biz_arr)

# Masks (only exist when the split cell ran with DO_SPLIT enabled)
if DO_SPLIT:
    np.save(os.path.join(processed_dir, "train_mask_users.npy"), train_mask_users)
    np.save(os.path.join(processed_dir, "val_mask_users.npy"), val_mask_users)
    np.save(os.path.join(processed_dir, "test_mask_users.npy"), test_mask_users)

# Small metadata .json describing how the arrays were produced.
metadata = {
    "scenario": SCENARIO,
    "num_users": num_users,
    "num_businesses": num_biz,
    "user_feature_cols": user_feature_cols,
    "biz_feature_cols": biz_feature_cols,
    "do_split": DO_SPLIT,
    "train_ratio": TRAIN_RATIO,
    "val_ratio": VAL_RATIO,
    "test_ratio": TEST_RATIO,
    "SINGLE_TASK_USER_ONLY": SINGLE_TASK_USER_ONLY,
    "edges_user_user_count": user_user_arr.shape[1],
    "edges_user_biz_count": user_biz_arr.shape[1],
}
with open(os.path.join(processed_dir, "metadata.json"), "w") as f:
    json.dump(metadata, f, indent=2)

print("\nAll processed data saved to:", processed_dir)
print("Data prep complete!")



All processed data saved to: /Users/harshil/Development/personal_projects/VeriShield-ML-Experiments/data_generators/data/high_fraud/processed_gnn
Data prep complete!
