In [6]:
# Jupyter Notebook: 02-GNN-DataPrep-EDA-1.ipynb
# =============================================
# Purpose:
#  - Thoroughly analyze the processed GNN data in 'processed_gnn/' directory.
#  - Produce text-based summaries (no plots/visualizations).
#  - Validate feature shapes, mask splits, label distributions, etc.
#  - Guide further refinement of GNN architecture choices.
#
# Updated to handle:
#  - IP node arrays (ip_features.npy, ip_labels.npy)
#  - user->ip edges (edge_user_ip.npy)
#  - Optional IP masks (train_mask_ip.npy, etc.)

# =====================================================================
# Cell 1: Imports & Initial Config
# =====================================================================
import os
import json
import numpy as np

print("Notebook: 02-GNN-DataPrep-EDA-1 (Extended for IP).")
print("Analyzing 'processed_gnn' data with user, biz, ip node types (no visuals).")

Notebook: 02-GNN-DataPrep-EDA-1 (Extended for IP).
Analyzing 'processed_gnn' data with user, biz, ip node types (no visuals).


In [7]:
# =====================================================================
# Cell 2: Define Paths & Check Files
# =====================================================================
SCENARIO = "medium_fraud"
BASE_DIR = (
    "/Users/harshil/Development/GitHub_Repos/"
    "VeriShield-AI-Financial-Verification-Platform/"
    "verishield_ml_experiments/data_generators/data"
)
PROCESSED_DIR = os.path.join(BASE_DIR, SCENARIO, "processed_gnn")

print(f"Scenario: {SCENARIO}")
print(f"Looking for processed data in: {PROCESSED_DIR}")

# Expected arrays for user, business, IP
expected_files = [
    "user_features.npy",
    "user_labels.npy",
    "biz_features.npy",
    "biz_labels.npy",
    "ip_features.npy",   # new IP node features
    "ip_labels.npy",     # if you plan IP classification, might be all zeros otherwise
    "edge_user_user.npy",
    "edge_user_biz.npy",
    "edge_user_ip.npy",  # new user->ip edges
    "metadata.json"
]

# Optional mask files for user, biz, ip
optional_files = [
    "train_mask_users.npy", "val_mask_users.npy", "test_mask_users.npy",
    "train_mask_biz.npy",   "val_mask_biz.npy",   "test_mask_biz.npy",
    "train_mask_ip.npy",    "val_mask_ip.npy",    "test_mask_ip.npy"
]

print("\nChecking for required files:")
all_found = True
for ef in expected_files:
    path = os.path.join(PROCESSED_DIR, ef)
    exists = os.path.isfile(path)
    print(f"  {ef:25s} => {'FOUND' if exists else 'MISSING'}")
    if not exists:
        all_found = False

print("\nChecking for optional mask files:")
for of in optional_files:
    path = os.path.join(PROCESSED_DIR, of)
    status = "FOUND" if os.path.isfile(path) else "MISSING"
    print(f"  {of:25s} => {status}")

if not all_found:
    print("\nWARNING: One or more required files are missing. "
          "This EDA might be incomplete.\n")

Scenario: medium_fraud
Looking for processed data in: /Users/harshil/Development/GitHub_Repos/VeriShield-AI-Financial-Verification-Platform/verishield_ml_experiments/data_generators/data/medium_fraud/processed_gnn

Checking for required files:
  user_features.npy         => FOUND
  user_labels.npy           => FOUND
  biz_features.npy          => FOUND
  biz_labels.npy            => FOUND
  ip_features.npy           => FOUND
  ip_labels.npy             => FOUND
  edge_user_user.npy        => FOUND
  edge_user_biz.npy         => FOUND
  edge_user_ip.npy          => FOUND
  metadata.json             => FOUND

Checking for optional mask files:
  train_mask_users.npy      => FOUND
  val_mask_users.npy        => FOUND
  test_mask_users.npy       => FOUND
  train_mask_biz.npy        => MISSING
  val_mask_biz.npy          => MISSING
  test_mask_biz.npy         => MISSING
  train_mask_ip.npy         => FOUND
  val_mask_ip.npy           => FOUND
  test_mask_ip.npy          => FOUND


In [8]:
# =====================================================================
# Cell 3: Load Metadata & Summaries
# =====================================================================
metadata_path = os.path.join(PROCESSED_DIR, "metadata.json")
metadata = {}

if os.path.isfile(metadata_path):
    with open(metadata_path, "r") as f:
        metadata = json.load(f)
    print("\nLoaded metadata.json contents:")
    for k, v in metadata.items():
        print(f"  {k}: {v}")
else:
    print("\nNo metadata.json found; skipping metadata checks.")


Loaded metadata.json contents:
  scenario: medium_fraud
  num_users: 100000
  num_businesses: 10000
  num_ips: 5000
  user_feature_cols: ['segment_code', 'is_ring_leader', 'ip_count_log', 'phone_susp', 'email_susp', 'country_watch', 'burst_signup']
  biz_feature_cols: ['watchlist_regctry', 'susp_name_flag', 'biz_age_log']
  ip_feature_cols: ['susp_ip_flag']
  do_split: True
  train_ratio: 0.7
  val_ratio: 0.15
  test_ratio: 0.15
  SINGLE_TASK_USER_ONLY: True
  edges_user_user_count: 10274
  edges_user_biz_count: 221613
  edges_user_ip_count: 100000


In [9]:
# =====================================================================
# Cell 4: Load Core Arrays & Basic Checks
# =====================================================================
def load_array(filename):
    path = os.path.join(PROCESSED_DIR, filename)
    if os.path.isfile(path):
        return np.load(path)
    else:
        return None

user_features = load_array("user_features.npy")
user_labels   = load_array("user_labels.npy")
biz_features  = load_array("biz_features.npy")
biz_labels    = load_array("biz_labels.npy")
ip_features   = load_array("ip_features.npy")
ip_labels     = load_array("ip_labels.npy")

edge_user_user = load_array("edge_user_user.npy")
edge_user_biz  = load_array("edge_user_biz.npy")
edge_user_ip   = load_array("edge_user_ip.npy")

print("\n===== Loaded Core Arrays =====")
if user_features is not None:
    print(f"user_features: shape={user_features.shape}, dtype={user_features.dtype}")
if user_labels is not None:
    print(f"user_labels:   shape={user_labels.shape}, dtype={user_labels.dtype}")

if biz_features is not None:
    print(f"biz_features:  shape={biz_features.shape}, dtype={biz_features.dtype}")
if biz_labels is not None:
    print(f"biz_labels:    shape={biz_labels.shape}, dtype={biz_labels.dtype}")

if ip_features is not None:
    print(f"ip_features:   shape={ip_features.shape}, dtype={ip_features.dtype}")
if ip_labels is not None:
    print(f"ip_labels:     shape={ip_labels.shape}, dtype={ip_labels.dtype}")

if edge_user_user is not None:
    print(f"edge_user_user: shape={edge_user_user.shape}, dtype={edge_user_user.dtype}")
if edge_user_biz is not None:
    print(f"edge_user_biz:  shape={edge_user_biz.shape}, dtype={edge_user_biz.dtype}")
if edge_user_ip is not None:
    print(f"edge_user_ip:   shape={edge_user_ip.shape}, dtype={edge_user_ip.dtype}")


===== Loaded Core Arrays =====
user_features: shape=(100000, 7), dtype=float32
user_labels:   shape=(100000,), dtype=int64
biz_features:  shape=(10000, 3), dtype=float32
biz_labels:    shape=(10000,), dtype=int64
ip_features:   shape=(5000, 1), dtype=float32
ip_labels:     shape=(5000,), dtype=int64
edge_user_user: shape=(2, 10274), dtype=int64
edge_user_biz:  shape=(2, 221613), dtype=int64
edge_user_ip:   shape=(2, 100000), dtype=int64


In [10]:
# =====================================================================
# Cell 5: Check Label Distributions
# =====================================================================
print("\n===== Label Distribution Checks =====")

if user_labels is not None:
    num_users = user_labels.shape[0]
    user_fraud_count = np.sum(user_labels == 1)
    print(f"Users => count={num_users}, fraud_count={user_fraud_count}, "
          f"fraud_ratio={user_fraud_count/num_users:.2%}")

if biz_labels is not None:
    num_biz = biz_labels.shape[0]
    biz_fraud_count = np.sum(biz_labels == 1)
    print(f"Businesses => count={num_biz}, fraud_count={biz_fraud_count}, "
          f"fraud_ratio={biz_fraud_count/num_biz:.2%}")

if ip_labels is not None:
    num_ips = ip_labels.shape[0]
    ip_susp_count = np.sum(ip_labels == 1)  # if 1 means suspicious
    print(f"IPs => count={num_ips}, 'label=1' count={ip_susp_count}, "
          f"ratio={ip_susp_count/num_ips:.2%}")


===== Label Distribution Checks =====
Users => count=100000, fraud_count=74544, fraud_ratio=74.54%
Businesses => count=10000, fraud_count=9683, fraud_ratio=96.83%
IPs => count=5000, 'label=1' count=0, ratio=0.00%


In [11]:
# =====================================================================
# Cell 6: Basic Feature Stats
# =====================================================================
print("\n===== User Features Stats =====")
if user_features is not None:
    num_user_cols = user_features.shape[1]
    for col_idx in range(num_user_cols):
        col_data = user_features[:, col_idx]
        cmin, cmax = np.min(col_data), np.max(col_data)
        cmean, cstd = np.mean(col_data), np.std(col_data)
        print(f"  Col {col_idx}: min={cmin:.3f}, mean={cmean:.3f}, std={cstd:.3f}, max={cmax:.3f}")

print("\n===== Business Features Stats =====")
if biz_features is not None:
    num_biz_cols = biz_features.shape[1]
    for col_idx in range(num_biz_cols):
        col_data = biz_features[:, col_idx]
        cmin, cmax = np.min(col_data), np.max(col_data)
        cmean, cstd = np.mean(col_data), np.std(col_data)
        print(f"  Col {col_idx}: min={cmin:.3f}, mean={cmean:.3f}, std={cstd:.3f}, max={cmax:.3f}")

print("\n===== IP Features Stats =====")
if ip_features is not None:
    num_ip_cols = ip_features.shape[1]
    for col_idx in range(num_ip_cols):
        col_data = ip_features[:, col_idx]
        cmin, cmax = np.min(col_data), np.max(col_data)
        cmean, cstd = np.mean(col_data), np.std(col_data)
        print(f"  Col {col_idx}: min={cmin:.3f}, mean={cmean:.3f}, std={cstd:.3f}, max={cmax:.3f}")


===== User Features Stats =====
  Col 0: min=0.000, mean=0.408, std=0.692, max=3.000
  Col 1: min=0.000, mean=0.005, std=0.071, max=1.000
  Col 2: min=0.693, mean=0.693, std=0.000, max=0.693
  Col 3: min=0.000, mean=0.270, std=0.444, max=1.000
  Col 4: min=0.000, mean=0.000, std=0.000, max=0.000
  Col 5: min=0.000, mean=0.030, std=0.171, max=1.000
  Col 6: min=0.000, mean=0.050, std=0.219, max=1.000

===== Business Features Stats =====
  Col 0: min=0.000, mean=0.030, std=0.172, max=1.000
  Col 1: min=0.000, mean=0.402, std=0.490, max=1.000
  Col 2: min=0.693, mean=7.611, std=0.983, max=8.609

===== IP Features Stats =====
  Col 0: min=0.000, mean=0.000, std=0.000, max=0.000


In [12]:
# =====================================================================
# Cell 7: Edge Analysis
# =====================================================================
print("\n===== Edge Analysis =====")
def edge_summary(edge_arr, name_str):
    if edge_arr is not None:
        shape = edge_arr.shape
        ecount = shape[1] if len(shape) > 1 else 0
        print(f"{name_str}: shape={shape}, total_edges={ecount}")

edge_summary(edge_user_user, "User-User")
edge_summary(edge_user_biz,  "User-Business")
edge_summary(edge_user_ip,   "User-IP")

# Possibly check how many unique users appear in user-ip edges, etc.
if edge_user_ip is not None and ip_labels is not None and user_labels is not None:
    # shape(2, E) => row[0]=user_ids, row[1]=ip_ids
    user_ids_in_ip = np.unique(edge_user_ip[0,:])
    ip_ids_in_ip   = np.unique(edge_user_ip[1,:])
    print(f"  Distinct users in user-ip edges: {len(user_ids_in_ip)} out of {user_labels.shape[0]}")
    print(f"  Distinct IPs   in user-ip edges: {len(ip_ids_in_ip)} out of {ip_labels.shape[0]}")


===== Edge Analysis =====
User-User: shape=(2, 10274), total_edges=10274
User-Business: shape=(2, 221613), total_edges=221613
User-IP: shape=(2, 100000), total_edges=100000
  Distinct users in user-ip edges: 100000 out of 100000
  Distinct IPs   in user-ip edges: 5000 out of 5000


In [13]:
# =====================================================================
# Cell 8: Mask Splits (If Present)
# =====================================================================
print("\n===== Checking Train/Val/Test Masks =====")

mask_files = {
    "train_mask_users": "train_mask_users.npy",
    "val_mask_users":   "val_mask_users.npy",
    "test_mask_users":  "test_mask_users.npy",

    "train_mask_biz":   "train_mask_biz.npy",
    "val_mask_biz":     "val_mask_biz.npy",
    "test_mask_biz":    "test_mask_biz.npy",

    "train_mask_ip":    "train_mask_ip.npy",
    "val_mask_ip":      "val_mask_ip.npy",
    "test_mask_ip":     "test_mask_ip.npy"
}

masks = {}
for key, filename in mask_files.items():
    path = os.path.join(PROCESSED_DIR, filename)
    if os.path.isfile(path):
        masks[key] = np.load(path)
        print(f"  {filename:25s} => FOUND, shape={masks[key].shape}")
    else:
        print(f"  {filename:25s} => MISSING")

def check_mask(mask, label_str, total_count):
    if mask is None:
        return None
    sum_mask = mask.sum()
    print(f"  {label_str} => {sum_mask} / {total_count} = {sum_mask / total_count:.2%}")
    return sum_mask

# Summaries
if user_labels is not None:
    total_users = user_labels.shape[0]
    print("\nUser mask stats:")
    user_train_sum = check_mask(masks.get("train_mask_users"), "train_mask_users", total_users)
    user_val_sum   = check_mask(masks.get("val_mask_users"),   "val_mask_users",   total_users)
    user_test_sum  = check_mask(masks.get("test_mask_users"),  "test_mask_users",  total_users)

if biz_labels is not None:
    total_biz = biz_labels.shape[0]
    print("\nBusiness mask stats:")
    biz_train_sum = check_mask(masks.get("train_mask_biz"), "train_mask_biz", total_biz)
    biz_val_sum   = check_mask(masks.get("val_mask_biz"),   "val_mask_biz",   total_biz)
    biz_test_sum  = check_mask(masks.get("test_mask_biz"),  "test_mask_biz",  total_biz)

if ip_labels is not None:
    total_ip = ip_labels.shape[0]
    print("\nIP mask stats:")
    ip_train_sum = check_mask(masks.get("train_mask_ip"), "train_mask_ip", total_ip)
    ip_val_sum   = check_mask(masks.get("val_mask_ip"),   "val_mask_ip",   total_ip)
    ip_test_sum  = check_mask(masks.get("test_mask_ip"),  "test_mask_ip",  total_ip)


===== Checking Train/Val/Test Masks =====
  train_mask_users.npy      => FOUND, shape=(100000,)
  val_mask_users.npy        => FOUND, shape=(100000,)
  test_mask_users.npy       => FOUND, shape=(100000,)
  train_mask_biz.npy        => MISSING
  val_mask_biz.npy          => MISSING
  test_mask_biz.npy         => MISSING
  train_mask_ip.npy         => FOUND, shape=(5000,)
  val_mask_ip.npy           => FOUND, shape=(5000,)
  test_mask_ip.npy          => FOUND, shape=(5000,)

User mask stats:
  train_mask_users => 70000 / 100000 = 70.00%
  val_mask_users => 15000 / 100000 = 15.00%
  test_mask_users => 15000 / 100000 = 15.00%

Business mask stats:

IP mask stats:
  train_mask_ip => 3500 / 5000 = 70.00%
  val_mask_ip => 750 / 5000 = 15.00%
  test_mask_ip => 750 / 5000 = 15.00%


In [14]:
# =====================================================================
# Cell 9: Additional Analytical Checks
# =====================================================================
print("\n===== Additional Checks / Observations =====")

# 1. Label distributions *within* user train/val/test
if user_labels is not None and "train_mask_users" in masks:
    # train ratio
    tmask = masks["train_mask_users"]
    user_train_fraud_ratio = user_labels[tmask].mean() if tmask is not None else 0
    user_val_fraud_ratio   = user_labels[masks["val_mask_users"]].mean()   if "val_mask_users" in masks and masks["val_mask_users"] is not None else 0
    user_test_fraud_ratio  = user_labels[masks["test_mask_users"]].mean()  if "test_mask_users" in masks and masks["test_mask_users"] is not None else 0

    print(f"User Fraud Ratio => train={user_train_fraud_ratio:.2%}, val={user_val_fraud_ratio:.2%}, test={user_test_fraud_ratio:.2%}")

# 2. Business label distributions in train/val/test
if biz_labels is not None and "train_mask_biz" in masks:
    tmask_biz = masks["train_mask_biz"]
    biz_train_fraud_ratio = biz_labels[tmask_biz].mean() if tmask_biz is not None else 0
    biz_val_fraud_ratio   = biz_labels[masks["val_mask_biz"]].mean()   if "val_mask_biz" in masks else 0
    biz_test_fraud_ratio  = biz_labels[masks["test_mask_biz"]].mean()  if "test_mask_biz" in masks else 0

    print(f"Business Fraud Ratio => train={biz_train_fraud_ratio:.2%}, val={biz_val_fraud_ratio:.2%}, test={biz_test_fraud_ratio:.2%}")

# 3. IP label distributions in train/val/test
if ip_labels is not None and "train_mask_ip" in masks:
    tmask_ip = masks["train_mask_ip"]
    ip_train_label_ratio = ip_labels[tmask_ip].mean() if tmask_ip is not None else 0
    ip_val_label_ratio   = ip_labels[masks["val_mask_ip"]].mean()   if "val_mask_ip" in masks else 0
    ip_test_label_ratio  = ip_labels[masks["test_mask_ip"]].mean()  if "test_mask_ip" in masks else 0

    print(f"IP Label Ratio => train={ip_train_label_ratio:.2%}, val={ip_val_label_ratio:.2%}, test={ip_test_label_ratio:.2%}")


===== Additional Checks / Observations =====
User Fraud Ratio => train=74.61%, val=74.74%, test=74.05%
IP Label Ratio => train=0.00%, val=0.00%, test=0.00%


In [15]:
# =====================================================================
# Cell 10: Conclusions / Next Steps
# =====================================================================
print("\n=== Conclusions & Next Steps ===")
print("1) Validated shapes & distributions for users, businesses, IP nodes.")
print("2) Verified label distribution & mask splits. Possibly balanced or imbalanced.")
print("3) Ready to feed data into a PyG HeteroData object for user->biz->ip GNN experiments.")
print("Done.")



=== Conclusions & Next Steps ===
1) Validated shapes & distributions for users, businesses, IP nodes.
2) Verified label distribution & mask splits. Possibly balanced or imbalanced.
3) Ready to feed data into a PyG HeteroData object for user->biz->ip GNN experiments.
Done.
