In [1]:
# Jupyter Notebook: 02-GNN-DataPrep-EDA-1.ipynb
# =============================================
# Purpose:
#  - Thoroughly analyze the processed GNN data in 'processed_gnn/' directory.
#  - Produce text-based summaries (no plots/visualizations).
#  - Validate feature shapes, mask splits, label distributions, etc.
#  - Guide further refinement of GNN architecture choices.

# =====================================================================
# Cell 1: Imports & Initial Config
# =====================================================================
import os
import json
import numpy as np

# All reporting goes through 'print'; matplotlib/seaborn are deliberately not
# imported so this stays a purely "analytical" text notebook.

for banner_line in (
    "Notebook: 02-GNN-DataPrep-EDA-1",
    "Analyzing processed GNN data in 'default/processed_gnn' directory (no visuals).",
):
    print(banner_line)

Notebook: 02-GNN-DataPrep-EDA-1
Analyzing processed GNN data in 'default/processed_gnn' directory (no visuals).


In [2]:
# =====================================================================
# Cell 2: Define Paths & Check Files
# =====================================================================
# Adjust these paths according to your local environment if needed.
# The expected layout is:
#   <BASE_DIR>/<SCENARIO>/processed_gnn/
# Both values can be overridden without editing this cell by exporting
# VERISHIELD_DATA_DIR / VERISHIELD_SCENARIO before the kernel starts; when they
# are unset (or empty) the defaults below are used.

SCENARIO = os.environ.get("VERISHIELD_SCENARIO") or "default"
BASE_DIR = os.environ.get("VERISHIELD_DATA_DIR") or (
    "/Users/harshil/Development/GitHub_Repos/"
    "VeriShield-AI-Financial-Verification-Platform/"
    "verishield_ml_experiments/data_generators/data-huge"
)
PROCESSED_DIR = os.path.join(BASE_DIR, SCENARIO, "processed_gnn")

print(f"Scenario: {SCENARIO}")
print(f"Looking for processed data in: {PROCESSED_DIR}")

# Files whose absence triggers the "EDA might be incomplete" warning below.
expected_files = [
    "user_features.npy",
    "user_labels.npy",
    "biz_features.npy",
    "biz_labels.npy",
    "edge_user_user.npy",
    "edge_user_biz.npy",
    "metadata.json"
]

# Train/val/test masks (both user & biz); reported but never treated as errors.
optional_files = [
    "train_mask_users.npy",
    "val_mask_users.npy",
    "test_mask_users.npy",
    "train_mask_biz.npy",
    "val_mask_biz.npy",
    "test_mask_biz.npy"
]


def report_file_status(filenames, directory):
    """Print a FOUND/MISSING line per file and return the missing names.

    Parameters
    ----------
    filenames : iterable of str
        File names to look up inside ``directory``.
    directory : str
        Directory that is expected to contain the files.

    Returns
    -------
    list of str
        The names (in input order) for which no regular file exists.
    """
    missing = []
    for name in filenames:
        present = os.path.isfile(os.path.join(directory, name))
        print(f"  {name:30s} => {'FOUND' if present else 'MISSING'}")
        if not present:
            missing.append(name)
    return missing


print("\nChecking for required files:")
missing_required = report_file_status(expected_files, PROCESSED_DIR)
all_found = not missing_required

print("\nChecking for optional mask files:")
missing_optional = report_file_status(optional_files, PROCESSED_DIR)

if not all_found:
    print("\nWARNING: One or more required files are missing. "
          "This EDA might be incomplete.\n")

Scenario: default
Looking for processed data in: /Users/harshil/Development/GitHub_Repos/VeriShield-AI-Financial-Verification-Platform/verishield_ml_experiments/data_generators/data-huge/default/processed_gnn

Checking for required files:
  user_features.npy              => FOUND
  user_labels.npy                => FOUND
  biz_features.npy               => FOUND
  biz_labels.npy                 => FOUND
  edge_user_user.npy             => FOUND
  edge_user_biz.npy              => FOUND
  metadata.json                  => FOUND

Checking for optional mask files:
  train_mask_users.npy           => FOUND
  val_mask_users.npy             => FOUND
  test_mask_users.npy            => FOUND
  train_mask_biz.npy             => MISSING
  val_mask_biz.npy               => MISSING
  test_mask_biz.npy              => MISSING


In [3]:
# =====================================================================
# Cell 3: Load Metadata & Summaries
# =====================================================================
metadata_path = os.path.join(PROCESSED_DIR, "metadata.json")
metadata = {}  # stays empty when the file is absent

if not os.path.isfile(metadata_path):
    print("\nNo metadata.json found; skipping metadata checks.")
else:
    with open(metadata_path, "r") as meta_file:
        metadata = json.load(meta_file)
    print("\nLoaded metadata.json contents:")
    # One "key: value" line per top-level metadata entry.
    for field, value in metadata.items():
        print(f"  {field}: {value}")


Loaded metadata.json contents:
  scenario: default
  num_users: 453738
  num_businesses: 69040
  user_feature_cols: ['segment_code', 'is_ring_leader', 'ip_count_log', 'phone_susp', 'email_susp', 'country_watch', 'burst_signup']
  biz_feature_cols: ['watchlist_regctry', 'susp_name_flag', 'biz_age_log']
  do_split: True
  train_ratio: 0.7
  val_ratio: 0.15
  test_ratio: 0.15
  SINGLE_TASK_USER_ONLY: True
  edges_user_user_count: 45646
  edges_user_biz_count: 996203


In [4]:
# =====================================================================
# Cell 4: Load Core Arrays & Basic Checks
# =====================================================================
def load_array(filename, directory=None):
    """Load a NumPy array file, returning None when it does not exist.

    Parameters
    ----------
    filename : str
        File name, joined onto ``directory``.
    directory : str, optional
        Directory to read from. When omitted, the notebook-level
        ``PROCESSED_DIR`` is used (looked up at call time, so re-running the
        path cell is picked up without redefining this function).

    Returns
    -------
    numpy.ndarray or None
        Whatever ``np.load`` returns for the file, or ``None`` if no regular
        file exists at the resolved path.
    """
    if directory is None:
        directory = PROCESSED_DIR
    path = os.path.join(directory, filename)
    if not os.path.isfile(path):
        return None
    return np.load(path)

# Node features / labels.
user_features = load_array("user_features.npy")
user_labels   = load_array("user_labels.npy")
biz_features  = load_array("biz_features.npy")
biz_labels    = load_array("biz_labels.npy")

# Edge lists.
edge_user_user = load_array("edge_user_user.npy")
edge_user_biz  = load_array("edge_user_biz.npy")

print("\n===== Loaded Core Arrays =====")
# Labels carry their own padding so the printed "shape=" columns line up;
# arrays that failed to load (None) are silently skipped.
for label, arr in (
    ("user_features:", user_features),
    ("user_labels:  ", user_labels),
    ("biz_features: ", biz_features),
    ("biz_labels:   ", biz_labels),
    ("edge_user_user:", edge_user_user),
    ("edge_user_biz: ", edge_user_biz),
):
    if arr is None:
        continue
    print(f"{label} shape={arr.shape}, dtype={arr.dtype}")


===== Loaded Core Arrays =====
user_features: shape=(453738, 7), dtype=float32
user_labels:   shape=(453738,), dtype=int64
biz_features:  shape=(69040, 3), dtype=float32
biz_labels:    shape=(69040,), dtype=int64
edge_user_user: shape=(2, 45646), dtype=int64
edge_user_biz:  shape=(2, 996203), dtype=int64


In [5]:
# =====================================================================
# Cell 5: Check Label Distributions
# =====================================================================
print("\n===== Label Distribution Checks =====")

# Label value 1 is counted as fraud; the ratio is fraud_count / node count.
if user_labels is not None:
    num_users = user_labels.shape[0]
    user_fraud_count = (user_labels == 1).sum()
    user_fraud_ratio = user_fraud_count / num_users
    print(f"Users => count={num_users}, fraud_count={user_fraud_count}, fraud_ratio={user_fraud_ratio:.2%}")

if biz_labels is not None:
    num_biz = biz_labels.shape[0]
    biz_fraud_count = (biz_labels == 1).sum()
    biz_fraud_ratio = biz_fraud_count / num_biz
    print(f"Businesses => count={num_biz}, fraud_count={biz_fraud_count}, fraud_ratio={biz_fraud_ratio:.2%}")


===== Label Distribution Checks =====
Users => count=453738, fraud_count=98738, fraud_ratio=21.76%
Businesses => count=69040, fraud_count=32561, fraud_ratio=47.16%


In [6]:
# =====================================================================
# Cell 6: Explore Basic Features
# =====================================================================
def print_column_stats(features):
    """Print min, mean, std and max (3 decimals) for each column of ``features``.

    ``features`` is indexed as ``features[:, col_idx]`` for every index in
    ``range(features.shape[1])``.
    """
    for col_idx in range(features.shape[1]):
        column = features[:, col_idx]
        print(
            f"  Column {col_idx}: min={column.min():.3f}, mean={column.mean():.3f}, "
            f"std={column.std():.3f}, max={column.max():.3f}"
        )


print("\n===== User Features Stats =====")

# Stats are only printed when both the features and the matching labels loaded.
if not (user_features is None or user_labels is None):
    print_column_stats(user_features)

print("\n===== Business Features Stats =====")
if not (biz_features is None or biz_labels is None):
    print_column_stats(biz_features)


===== User Features Stats =====
  Column 0: min=0.000, mean=0.411, std=0.696, max=3.000
  Column 1: min=0.000, mean=0.005, std=0.071, max=1.000
  Column 2: min=0.693, mean=1.596, std=1.807, max=5.394
  Column 3: min=0.000, mean=0.268, std=0.443, max=1.000
  Column 4: min=0.000, mean=0.000, std=0.000, max=0.000
  Column 5: min=0.000, mean=0.030, std=0.171, max=1.000
  Column 6: min=0.000, mean=0.050, std=0.218, max=1.000

===== Business Features Stats =====
  Column 0: min=0.000, mean=0.031, std=0.172, max=1.000
  Column 1: min=0.000, mean=0.401, std=0.490, max=1.000
  Column 2: min=0.693, mean=7.613, std=0.994, max=8.609


In [7]:
# =====================================================================
# Cell 7: Edge Analysis
# =====================================================================
print("\n===== Edge Analysis =====")

# User-user edges: the edge count is the size of the second axis; an array
# without a second axis is reported as having zero edges.
# (No duplicate/unique-edge count is computed here.)
if edge_user_user is not None:
    euu_shape = edge_user_user.shape
    if len(euu_shape) > 1:
        euu_count = euu_shape[1]
    else:
        euu_count = 0
    print(f"User-User edges => shape={euu_shape}, total_edges={euu_count}")

# User-business edges: same convention.
if edge_user_biz is not None:
    eub_shape = edge_user_biz.shape
    if len(eub_shape) > 1:
        eub_count = eub_shape[1]
    else:
        eub_count = 0
    print(f"User-Business edges => shape={eub_shape}, total_edges={eub_count}")

# Coverage: how many distinct users / businesses take part in user-business edges.
if all(arr is not None for arr in (edge_user_biz, user_labels, biz_labels)):
    # shape(2, E) => row[0]=user_ids, row[1]=biz_ids
    user_ids_in_biz = np.unique(edge_user_biz[0, :])
    biz_ids_in_biz = np.unique(edge_user_biz[1, :])
    total_user_nodes = user_labels.shape[0]
    total_biz_nodes = biz_labels.shape[0]
    print(f"  Distinct users in user-business edges: {len(user_ids_in_biz)} out of {total_user_nodes}")
    print(f"  Distinct businesses in user-business edges: {len(biz_ids_in_biz)} out of {total_biz_nodes}")


===== Edge Analysis =====
User-User edges => shape=(2, 45646), total_edges=45646
User-Business edges => shape=(2, 996203), total_edges=996203
  Distinct users in user-business edges: 181139 out of 453738
  Distinct businesses in user-business edges: 69040 out of 69040


In [8]:
# =====================================================================
# Cell 8: Mask Splits (If Present)
# =====================================================================
print("\n===== Checking Train/Val/Test Masks =====")

# Each mask is None when its file is absent (see load_array).
train_mask_users, val_mask_users, test_mask_users = (
    load_array(f"{split}_mask_users.npy") for split in ("train", "val", "test")
)

train_mask_biz, val_mask_biz, test_mask_biz = (
    load_array(f"{split}_mask_biz.npy") for split in ("train", "val", "test")
)

def check_mask(mask, label_str):
    """Return ``mask.sum()``, or report the mask as missing and return 0.

    Parameters
    ----------
    mask : array-like with a ``sum`` method, or None
        The split mask to summarise.
    label_str : str
        Name printed in the "MISSING" line when ``mask`` is None.
    """
    if mask is not None:
        return mask.sum()
    print(f"  {label_str} => MISSING")
    return 0

def warn_on_mask_overlap(entity_label, masks_by_split):
    """Print a warning for every pair of split masks that select a common node.

    Comparing the summed split sizes against the node count cannot detect a
    node that sits in two splits while another sits in none, so disjointness
    is checked explicitly here. Nothing is printed when the masks are disjoint.

    Parameters
    ----------
    entity_label : str
        Prefix used in the warning text (e.g. "user").
    masks_by_split : list of (str, array or None)
        ``(split_name, mask)`` pairs; ``None`` masks are ignored. Masks are
        interpreted as truthy-per-node arrays (converted with ``astype(bool)``).

    Returns
    -------
    int
        Number of split pairs that overlap.
    """
    present = [
        (split, np.asarray(mask).astype(bool))
        for split, mask in masks_by_split
        if mask is not None
    ]
    overlapping_pairs = 0
    for idx, (split_a, mask_a) in enumerate(present):
        for split_b, mask_b in present[idx + 1:]:
            if mask_a.shape != mask_b.shape:
                # Element-wise comparison is meaningless for differently shaped masks.
                print(f"  WARNING: {entity_label} {split_a}/{split_b} masks have different shapes "
                      f"({mask_a.shape} vs {mask_b.shape}); overlap not checked.")
                continue
            shared = int(np.count_nonzero(mask_a & mask_b))
            if shared:
                overlapping_pairs += 1
                print(f"  WARNING: {entity_label} {split_a}/{split_b} masks overlap on {shared} node(s).")
    return overlapping_pairs


if train_mask_users is not None:
    user_train_count = check_mask(train_mask_users, "train_mask_users")
    user_val_count = check_mask(val_mask_users,   "val_mask_users")
    user_test_count = check_mask(test_mask_users, "test_mask_users")
    total_users = user_labels.shape[0] if user_labels is not None else 0
    print(f"Users => train={user_train_count}, val={user_val_count}, test={user_test_count}, total={total_users}")
    if total_users > 0:
        sum_split = user_train_count + user_val_count + user_test_count
        print(f"  Sum of splits vs total: {sum_split} / {total_users}")
    # A matching sum does not prove the splits are disjoint; check it directly.
    user_overlap_pairs = warn_on_mask_overlap(
        "user",
        [("train", train_mask_users), ("val", val_mask_users), ("test", test_mask_users)],
    )
else:
    print("No user train/val/test masks found.")

if train_mask_biz is not None:
    biz_train_count = check_mask(train_mask_biz, "train_mask_biz")
    biz_val_count = check_mask(val_mask_biz,     "val_mask_biz")
    biz_test_count = check_mask(test_mask_biz,   "test_mask_biz")
    total_biz = biz_labels.shape[0] if biz_labels is not None else 0
    print(f"Businesses => train={biz_train_count}, val={biz_val_count}, test={biz_test_count}, total={total_biz}")
    if total_biz > 0:
        sum_split_biz = biz_train_count + biz_val_count + biz_test_count
        print(f"  Sum of splits vs total: {sum_split_biz} / {total_biz}")
    biz_overlap_pairs = warn_on_mask_overlap(
        "business",
        [("train", train_mask_biz), ("val", val_mask_biz), ("test", test_mask_biz)],
    )
else:
    print("No business train/val/test masks found or SINGLE_TASK_USER_ONLY is True.")


===== Checking Train/Val/Test Masks =====
Users => train=317616, val=68060, test=68062, total=453738
  Sum of splits vs total: 453738 / 453738
No business train/val/test masks found or SINGLE_TASK_USER_ONLY is True.


In [9]:
# =====================================================================
# Cell 9: Additional Analytical Checks
# =====================================================================
print("\n===== Additional Checks / Observations =====")


def masked_label_mean(labels, mask):
    """Return ``np.mean(labels[mask])``, or 0 when ``mask`` is None."""
    if mask is None:
        return 0
    return np.mean(labels[mask])


# 1. Mean user label inside each split (val/test fall back to 0 if their mask is absent).
if not (train_mask_users is None or user_labels is None):
    user_labels_train = user_labels[train_mask_users]
    ratio_train = np.mean(user_labels_train)
    ratio_val = masked_label_mean(user_labels, val_mask_users)
    ratio_test = masked_label_mean(user_labels, test_mask_users)
    print(f"User label ratio in train={ratio_train:.2%}, val={ratio_val:.2%}, test={ratio_test:.2%}")

# 2. Same summary for businesses, only when business masks were loaded.
if not (train_mask_biz is None or biz_labels is None):
    biz_labels_train = biz_labels[train_mask_biz]
    ratio_train_b = np.mean(biz_labels_train)
    ratio_val_b = masked_label_mean(biz_labels, val_mask_biz)
    ratio_test_b = masked_label_mean(biz_labels, test_mask_biz)
    print(f"Business label ratio in train={ratio_train_b:.2%}, val={ratio_val_b:.2%}, test={ratio_test_b:.2%}")

print("\nNo visualizations here—purely text-based summaries. Analysis complete.")


===== Additional Checks / Observations =====
User label ratio in train=21.75%, val=21.75%, test=21.84%

No visualizations here—purely text-based summaries. Analysis complete.


In [10]:
# =====================================================================
# Cell 10: Conclusions / Next Steps
# =====================================================================
# Closing summary, emitted one message per print call in this order.
closing_messages = (
    "\n=== Conclusions & Next Steps ===",
    "1) We've confirmed shapes of features/labels/edges. Possibly large user-biz edges if scenario=default.",
    "2) Observed label distributions. High business fraud ratio is typical for 'default'.",
    "3) Checked mask splits to ensure train/val/test sums match total nodes.",
    "\nRecommended next step: proceed to building a PyG HeteroData object and define a multi-task GNN, if needed.",
    "Done.",
)
for message in closing_messages:
    print(message)



=== Conclusions & Next Steps ===
1) We've confirmed shapes of features/labels/edges. Possibly large user-biz edges if scenario=default.
2) Observed label distributions. High business fraud ratio is typical for 'default'.
3) Checked mask splits to ensure train/val/test sums match total nodes.

Recommended next step: proceed to building a PyG HeteroData object and define a multi-task GNN, if needed.
Done.
