In [1]:
# 01-EDA-3.ipynb

# =====================================================================
# Cell 1: Imports and Global Settings
# =====================================================================
import pandas as pd

# Set pandas to show all columns in text outputs
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 50)  # limit row prints to avoid huge spam

print("Notebook: 01-EDA-3 for synthetic VeriShield data (multi-pass + ring leaders).")

Notebook: 01-EDA-3 for synthetic VeriShield data (multi-pass + ring leaders).


In [2]:
# =====================================================================
# Cell 2: Load CSVs
# =====================================================================
# Paths (update as needed)
PATH_USERS = "/Users/harshil/Development/personal_projects/VeriShield-ML-Experiments/data_generators/synthetic_users.csv"
PATH_BUSINESSES = "/Users/harshil/Development/personal_projects/VeriShield-ML-Experiments/data_generators/synthetic_businesses.csv"
PATH_RELATIONSHIPS = "/Users/harshil/Development/personal_projects/VeriShield-ML-Experiments/data_generators/user_business_relationships.csv"
PATH_USERUSER = "/Users/harshil/Development/personal_projects/VeriShield-ML-Experiments/data_generators/user_user_relationships.csv"  # new user-user CSV

# Load main 3 CSVs
df_users = pd.read_csv(PATH_USERS)
df_businesses = pd.read_csv(PATH_BUSINESSES)
df_relationships = pd.read_csv(PATH_RELATIONSHIPS)

# Attempt to load user-user relationships
try:
    df_user_user = pd.read_csv(PATH_USERUSER)
    print("User-User relationships loaded.")
except FileNotFoundError:
    print("No 'user_user_relationships.csv' found; skipping user-user analysis.")
    df_user_user = pd.DataFrame()

print("\nDataFrames loaded:")
print(f"  Users shape: {df_users.shape}")
print(f"  Businesses shape: {df_businesses.shape}")
print(f"  User-Biz shape: {df_relationships.shape}")
if not df_user_user.empty:
    print(f"  User-User shape: {df_user_user.shape}")

User-User relationships loaded.

DataFrames loaded:
  Users shape: (100000, 18)
  Businesses shape: (10000, 6)
  User-Biz shape: (220942, 2)
  User-User shape: (5029, 2)


In [3]:
# =====================================================================
# Cell 2.1: (Optional) Quick look at user-user relationships
# =====================================================================
if not df_user_user.empty:
    print("\n=== User-User: .info() ===")
    df_user_user.info()

    print("\n=== User-User .head(5) ===")
    print(df_user_user.head(5))

    # Basic stats about distinct users in from->to
    distinct_from = df_user_user['from_user_id'].nunique()
    distinct_to = df_user_user['to_user_id'].nunique()
    print(f"\nDistinct 'from_user_id': {distinct_from}")
    print(f"Distinct 'to_user_id': {distinct_to}")

    # If 'is_ring_leader' column exists in df_users, see how many ring leaders we have
    if 'is_ring_leader' in df_users.columns:
        ring_leaders_count = df_users['is_ring_leader'].sum()
        print(f"Ring Leaders present: {ring_leaders_count}")
    else:
        print("No 'is_ring_leader' column in df_users. Skipping ring leader analysis.")


=== User-User: .info() ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5029 entries, 0 to 5028
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   from_user_id  5029 non-null   int64
 1   to_user_id    5029 non-null   int64
dtypes: int64(2)
memory usage: 78.7 KB

=== User-User .head(5) ===
   from_user_id  to_user_id
0         35220       65492
1         35220       14511
2         35220       77336
3         35220       64297
4         35220       28547

Distinct 'from_user_id': 500
Distinct 'to_user_id': 4919
Ring Leaders present: 500


In [4]:
# =====================================================================
# Cell 3: Inspect df_users
# =====================================================================
print("\n=== Users: .info() ===")
df_users.info()

print("\n=== Users: .head(5) ===")
print(df_users.head(5))

print("\n=== Users: .describe() (numeric/time fields) ===")
print(df_users.describe(include=[int, float, 'datetime']))

print("\n=== Users: .describe(include='object') ===")
print(df_users.describe(include=[object]))

print("\n=== Users: Checking for missing values by column ===")
print(df_users.isnull().sum())

print("\n=== Users: Fraud Label Counts ===")
if 'fraud_label' in df_users.columns:
    fraud_label_counts = df_users['fraud_label'].value_counts(dropna=False)
    print(fraud_label_counts)
    # FRAUD RATIO
    fraud_ratio_users = (df_users['fraud_label'] == 1).mean()
    print(f"Users Fraud Ratio: {fraud_ratio_users:.4%}")
else:
    print("'fraud_label' not found in df_users.")

# Segment check
print("\n=== Users: Segment Distribution ===")
if 'segment' in df_users.columns:
    print(df_users['segment'].value_counts(dropna=False))
else:
    print("No 'segment' column found in df_users.")

# Ring leader check
print("\n=== Users: is_ring_leader Distribution ===")
if 'is_ring_leader' in df_users.columns:
    print(df_users['is_ring_leader'].value_counts(dropna=False))
else:
    print("No 'is_ring_leader' column found in df_users.")

# Country code frequency
print("\n=== Users: Sample Country Code Frequency (top 10) ===")
if 'country_code' in df_users.columns:
    print(df_users['country_code'].value_counts(dropna=False).head(10))
else:
    print("No 'country_code' column found in df_users.")

# Gender frequency
print("\n=== Users: Gender Frequency (if applicable) ===")
if 'gender' in df_users.columns:
    print(df_users['gender'].value_counts(dropna=False))
else:
    print("No 'gender' column found.")

# Top IPs
print("\n=== Users: Top 10 IPs by frequency ===")
if 'signup_ip' in df_users.columns:
    ip_counts = df_users['signup_ip'].value_counts().head(10)
    print(ip_counts)
else:
    print("No 'signup_ip' column found in df_users.")

# Email domain check
print("\n=== Users: Top 10 Email Domains ===")
if 'email_domain' in df_users.columns:
    print(df_users['email_domain'].value_counts().head(10))
elif 'email' in df_users.columns:
    # derive quickly
    df_users['email_domain'] = df_users['email'].apply(
        lambda x: x.split('@')[-1] if x and '@' in str(x) else 'missing'
    )
    print(df_users['email_domain'].value_counts().head(10))
else:
    print("No 'email' or 'email_domain' column found in df_users.")

# Device ID
print("\n=== Users: Checking for 'device_id' ===")
if 'device_id' in df_users.columns:
    print("Sample device_id counts (top 10):")
    print(df_users['device_id'].value_counts().head(10))
else:
    print("No 'device_id' column found in df_users.")

# Burst signup
print("\n=== Users: Checking for 'burst_signup' ===")
if 'burst_signup' in df_users.columns:
    print(df_users['burst_signup'].value_counts(dropna=False))
else:
    print("No 'burst_signup' column found in df_users.")

# Additional derived features
if 'num_fraud_biz_owned' in df_users.columns:
    print("\n=== Users: num_fraud_biz_owned distribution ===")
    print(df_users['num_fraud_biz_owned'].describe())
else:
    print("No 'num_fraud_biz_owned' column found in df_users.")

if 'ip_count' in df_users.columns:
    print("\n=== Users: ip_count distribution ===")
    print(df_users['ip_count'].describe())
else:
    print("No 'ip_count' column found in df_users.")


=== Users: .info() ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 18 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   user_id              100000 non-null  int64  
 1   segment              100000 non-null  object 
 2   name                 98055 non-null   object 
 3   email                98019 non-null   object 
 4   username             100000 non-null  object 
 5   birthdate            100000 non-null  object 
 6   gender               100000 non-null  object 
 7   signup_ip            100000 non-null  object 
 8   device_id            100000 non-null  object 
 9   phone                97923 non-null   object 
 10  country_code         97475 non-null   object 
 11  created_at           100000 non-null  object 
 12  burst_signup         100000 non-null  bool   
 13  fraud_label          100000 non-null  int64  
 14  is_ring_leader       100000 non-null  bool   

In [5]:
# =====================================================================
# Cell 4: More advanced grouping on df_users
# =====================================================================
print("\n=== Group by 'fraud_label' and 'country_code' (top 10 combos) ===")
if 'fraud_label' in df_users.columns and 'country_code' in df_users.columns:
    user_grouped = df_users.groupby(['fraud_label', 'country_code']).size().reset_index(name='count')
    user_grouped_sorted = user_grouped.sort_values('count', ascending=False).head(10)
    print(user_grouped_sorted)
else:
    print("Skipping group by (fraud_label, country_code). Missing columns.")

# Possibly check ring_leader vs. fraud_label
if 'fraud_label' in df_users.columns and 'is_ring_leader' in df_users.columns:
    print("\n=== Cross-tab: ring_leader vs. fraud_label ===")
    ctab_leader = pd.crosstab(df_users['is_ring_leader'], df_users['fraud_label'], margins=True)
    print(ctab_leader)


=== Group by 'fraud_label' and 'country_code' (top 10 combos) ===
     fraud_label country_code  count
44             0           DJ    316
138            0           PK    316
14             0           BE    310
186            0           VE    308
161            0           SR    308
77             0           IL    302
137            0           PH    302
154            0           SG    302
95             0           KZ    301
41             0           CY    301

=== Cross-tab: ring_leader vs. fraud_label ===
fraud_label         0      1     All
is_ring_leader                      
False           54378  45122   99500
True              146    354     500
All             54524  45476  100000


In [6]:
# =====================================================================
# Cell 4.1: Checking IP collisions
# =====================================================================
if 'signup_ip' in df_users.columns and 'fraud_label' in df_users.columns:
    print("\n=== Group by 'fraud_label' and 'signup_ip', top suspicious IPs (top 5) ===")
    ip_grouped = df_users.groupby(['fraud_label','signup_ip']).size().reset_index(name='count')
    ip_sorted = ip_grouped.sort_values('count', ascending=False).head(5)
    print(ip_sorted)

# For device_id collisions
if 'device_id' in df_users.columns:
    print("\n=== Group by 'device_id' to see if multiple users share same device ===")
    device_grouped = df_users.groupby('device_id').size().reset_index(name='count')
    print(device_grouped.sort_values('count', ascending=False).head(10))
    avg_device_ownership = device_grouped['count'].mean()
    print(f"Average users per device_id: {avg_device_ownership:.2f}")


=== Group by 'fraud_label' and 'signup_ip', top suspicious IPs (top 5) ===
       fraud_label        signup_ip  count
39023            0    6.164.183.177     38
4526             0   131.35.138.120     36
17721            0   186.214.193.95     35
5337             0  135.184.211.113     35
28945            0   212.52.134.137     34

=== Group by 'device_id' to see if multiple users share same device ===
             device_id  count
0      device_AAAEEJZm      1
66650  device_iiSVnITA      1
66672  device_iipotFew      1
66671  device_iinAEhnz      1
66670  device_iimrfziy      1
66669  device_iimeqARs      1
66668  device_iiirccuo      1
66667  device_iiiMEtXc      1
66666  device_iihSDdqP      1
66665  device_iifwvvAg      1
Average users per device_id: 1.00


In [7]:
# =====================================================================
# Cell 5: Inspect df_businesses
# =====================================================================
print("\n=== Businesses: .info() ===")
df_businesses.info()

print("\n=== Businesses: .head(5) ===")
print(df_businesses.head(5))

print("\n=== Businesses: .describe() (numeric/time) ===")
print(df_businesses.describe(include=[int, float, 'datetime']))

print("\n=== Businesses: .describe(include='object') ===")
print(df_businesses.describe(include=[object]))

print("\n=== Businesses: Checking for missing values by column ===")
print(df_businesses.isnull().sum())

print("\n=== Businesses: Fraud Label Counts ===")
if 'fraud_label' in df_businesses.columns:
    biz_fraud_counts = df_businesses['fraud_label'].value_counts(dropna=False)
    print(biz_fraud_counts)
    # FRAUD RATIO
    fraud_ratio_biz = (df_businesses['fraud_label'] == 1).mean()
    print(f"Businesses Fraud Ratio: {fraud_ratio_biz:.4%}")
else:
    print("No 'fraud_label' column found in businesses.")

# Registration country freq
print("\n=== Businesses: Registration Country Frequency (top 10) ===")
if 'registration_country' in df_businesses.columns:
    print(df_businesses['registration_country'].value_counts(dropna=False).head(10))
else:
    print("No 'registration_country' column found in df_businesses.")

if 'owner_name' in df_businesses.columns:
    print("\n=== Businesses: Top 10 Owner Names by frequency ===")
    print(df_businesses['owner_name'].value_counts().head(10))


=== Businesses: .info() ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   business_id           10000 non-null  int64 
 1   business_name         10000 non-null  object
 2   registration_country  9759 non-null   object
 3   incorporation_date    10000 non-null  object
 4   owner_name            9805 non-null   object
 5   fraud_label           10000 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 468.9+ KB

=== Businesses: .head(5) ===
   business_id                         business_name registration_country  \
0            1                                   LLC                   MX   
1            2  strategize value-added web-readiness                   ER   
2            3                      Morales and Sons                   CY   
3            4            Vazquez, Watkins and Chase                   UA 

In [8]:
# =====================================================================
# Cell 6: Inspect df_relationships (User-Business)
# =====================================================================
print("\n=== Relationships: .info() ===")
df_relationships.info()

print("\n=== User-Biz: .head(5) ===")
print(df_relationships.head(5))

print("\n=== User-Biz: .describe() ===")
print(df_relationships.describe())

unique_users_in_rels = df_relationships['user_id'].nunique()
unique_biz_in_rels = df_relationships['business_id'].nunique()
print(f"\nUnique user_ids in relationships: {unique_users_in_rels}")
print(f"Unique business_ids in relationships: {unique_biz_in_rels}")

# Range checks
print("\n=== Relationship Range Checks ===")
min_user_id = df_relationships['user_id'].min()
max_user_id = df_relationships['user_id'].max()
min_biz_id = df_relationships['business_id'].min()
max_biz_id = df_relationships['business_id'].max()
print(f"User ID range in relationships: {min_user_id} to {max_user_id}")
print(f"Business ID range in relationships: {min_biz_id} to {max_biz_id}")


=== Relationships: .info() ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 220942 entries, 0 to 220941
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype
---  ------       --------------   -----
 0   user_id      220942 non-null  int64
 1   business_id  220942 non-null  int64
dtypes: int64(2)
memory usage: 3.4 MB

=== User-Biz: .head(5) ===
   user_id  business_id
0        1         1252
1        1         8337
2       10         5422
3       10         6557
4       13         8288

=== User-Biz: .describe() ===
             user_id    business_id
count  220942.000000  220942.000000
mean    50073.553041    4998.785994
std     28852.443531    2883.843662
min         1.000000       1.000000
25%     25130.000000    2505.000000
50%     50101.500000    4998.000000
75%     74950.000000    7496.000000
max     99998.000000   10000.000000

Unique user_ids in relationships: 40224
Unique business_ids in relationships: 10000

=== Relationship Range Checks ===
User ID r

In [9]:
# =====================================================================
# Cell 7: Relationship distributions
# =====================================================================
# Businesses per user
rel_count_by_user = df_relationships.groupby('user_id').size().reset_index(name='num_businesses')
print("\n=== Top 10 users by number of businesses owned ===")
print(rel_count_by_user.sort_values('num_businesses', ascending=False).head(10))

avg_biz_per_user = rel_count_by_user['num_businesses'].mean()
print(f"Average businesses owned per user: {avg_biz_per_user:.2f}")

# Owners per business
rel_count_by_biz = df_relationships.groupby('business_id').size().reset_index(name='num_owners')
print("\n=== Top 10 businesses by number of owners ===")
print(rel_count_by_biz.sort_values('num_owners', ascending=False).head(10))

avg_users_per_biz = rel_count_by_biz['num_owners'].mean()
print(f"Average owners per business: {avg_users_per_biz:.2f}")

# Cross-check ownership by fraud
print("\n=== (Optional) Merge relationships with user + business fraud to see correlation ===")
df_rels_users = df_relationships.merge(
    df_users[['user_id','fraud_label']], on='user_id', how='left'
).rename(columns={'fraud_label':'user_fraud'})

df_merged = df_rels_users.merge(
    df_businesses[['business_id','fraud_label']], on='business_id', how='left'
).rename(columns={'fraud_label':'biz_fraud'})

print(f"Merged shape: {df_merged.shape}. Head(5):")
print(df_merged.head(5))

print("\n=== Cross-tab of user_fraud vs. biz_fraud ===")
ctab = pd.crosstab(df_merged['user_fraud'], df_merged['biz_fraud'], margins=True)
print(ctab)


=== Top 10 users by number of businesses owned ===
       user_id  num_businesses
40223    99998              10
30202    74950              10
8440     21200              10
30159    74859              10
30168    74880              10
30169    74881              10
30174    74892              10
8419     21142              10
30184    74916              10
30188    74921              10
Average businesses owned per user: 5.49

=== Top 10 businesses by number of owners ===
      business_id  num_owners
3197         3198          41
7492         7493          41
9759         9760          39
278           279          39
9965         9966          39
2146         2147          38
6052         6053          38
8917         8918          38
1550         1551          38
5975         5976          38
Average owners per business: 22.09

=== (Optional) Merge relationships with user + business fraud to see correlation ===
Merged shape: (220942, 4). Head(5):
   user_id  business_id  user_fra

In [10]:
# =====================================================================
# Cell 8: (Optional) Quick check of ring leader => out-degree in df_user_user
# =====================================================================
if not df_user_user.empty and 'is_ring_leader' in df_users.columns:
    print("\n=== Checking ring leader out-degree ===")
    # from_user_id => how many edges
    leader_edges = df_user_user.groupby('from_user_id').size().reset_index(name='out_degree')
    # join to see if they're ring leaders
    leader_merged = leader_edges.merge(
        df_users[['user_id', 'is_ring_leader']], left_on='from_user_id', right_on='user_id', how='left'
    )
    # filter to ring_leader == True
    ring_leader_only = leader_merged[leader_merged['is_ring_leader'] == True].sort_values('out_degree', ascending=False)
    print("Top ring leaders by out-degree:")
    print(ring_leader_only.head(10))

    # If we want to cross-check ring leader fraud
    if 'fraud_label' in df_users.columns:
        df_users_ring = df_users[['user_id','is_ring_leader','fraud_label']]
        ring_fraud_ctab = pd.crosstab(df_users_ring['is_ring_leader'], df_users_ring['fraud_label'], margins=True)
        print("\n=== Cross-tab: is_ring_leader vs. fraud_label ===")
        print(ring_fraud_ctab)

print("\nDone with extended EDA checks.")


=== Checking ring leader out-degree ===
Top ring leaders by out-degree:
     from_user_id  out_degree  user_id  is_ring_leader
402         84099          15    84099            True
419         86941          15    86941            True
299         65584          15    65584            True
399         83586          15    83586            True
398         83408          15    83408            True
212         42976          15    42976            True
215         43656          15    43656            True
388         82005          15    82005            True
222         45109          15    45109            True
94          19646          15    19646            True

=== Cross-tab: is_ring_leader vs. fraud_label ===
fraud_label         0      1     All
is_ring_leader                      
False           54378  45122   99500
True              146    354     500
All             54524  45476  100000

Done with extended EDA checks.
