In [3]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta


# Seed Python's `random` module so the random draws made below are repeatable across runs.
# Values derived from datetime.now() further down still depend on the wall clock.
random.seed(42)


In [4]:


# Define sample data generation functions
def generate_transaction_id(n):
    """Return n sequential transaction IDs: "TX000001", "TX000002", ...

    The numeric part starts at 1 and is zero-padded to at least six digits.
    """
    ids = []
    for seq in range(1, n + 1):
        ids.append(f"TX{seq:06d}")
    return ids

def generate_account_id(n):
    """Return n sequential account IDs: "ACC000001", "ACC000002", ...

    The numeric part starts at 1 and is zero-padded to at least six digits.
    """
    last_number = n + 1
    return ["ACC%06d" % number for number in range(1, last_number)]

def generate_block_hash(n):
    """Return n sequential block labels: "BLOCK000001", "BLOCK000002", ...

    These are plain counters (starting at 1, zero-padded to at least six
    digits), not cryptographic hashes.
    """
    return list(map("BLOCK{:06d}".format, range(1, n + 1)))

def generate_ip_address():
    """Return a random dotted-quad string; each of the four octets is uniform in 0..255."""
    octets = []
    for _position in range(4):
        octets.append(str(random.randint(0, 255)))
    return ".".join(octets)

def generate_device_info():
    """Pick one of the four placeholder device labels uniformly at random."""
    device_labels = ("DeviceA", "DeviceB", "DeviceC", "DeviceD")
    return random.choice(device_labels)

def generate_geolocation():
    """Pick one of the five coarse region labels uniformly at random."""
    regions = ("US", "EU", "Asia", "Africa", "Oceania")
    chosen_region = random.choice(regions)
    return chosen_region

def generate_transaction_type():
    """Return "Regular Transfer" or "Smart Contract Interaction", each with equal probability."""
    kinds = ("Regular Transfer", "Smart Contract Interaction")
    return random.choice(kinds)

def generate_suspicious_flag():
    """Return a binary flag, 0 or 1, chosen with equal probability."""
    flag_values = (0, 1)
    return random.choice(flag_values)

# Size parameters for the synthetic dataset
num_accounts = 500          # number of distinct account IDs to generate
num_transactions = 1_000    # number of rows in the transaction table
num_accounts = 500

# Generate synthetic transaction data: one row per transaction.
# The clock is read once so that every timestamp is a whole-minute offset from
# the same instant; calling datetime.now() per row lets the reference drift
# while the column is being built.
tx_reference_time = datetime.now()
# Sender and receiver are drawn, with replacement, from the same pool of
# account IDs, so a row can occasionally have identical sender and receiver.
account_id_pool = generate_account_id(num_accounts)

transaction_data = pd.DataFrame({
    'Transaction ID': generate_transaction_id(num_transactions),
    'Sender Address': random.choices(account_id_pool, k=num_transactions),
    'Receiver Address': random.choices(account_id_pool, k=num_transactions),
    'Amount Transferred': [round(random.uniform(0.01, 100), 2) for _ in range(num_transactions)],
    # Timestamps fall 1-1440 minutes (i.e. up to 24 hours) before the reference instant.
    'Timestamp': [tx_reference_time - timedelta(minutes=random.randint(1, 1440)) for _ in range(num_transactions)],
    'Transaction Type': [generate_transaction_type() for _ in range(num_transactions)],
    # Label and flag are two independent 0/1 draws; they are not derived from any other column.
    'Fraudulent Label': [generate_suspicious_flag() for _ in range(num_transactions)],
    'Suspicious Transaction Flags': [generate_suspicious_flag() for _ in range(num_transactions)],
    'Block Hash': generate_block_hash(num_transactions),
    'Block Number': [random.randint(100000, 999999) for _ in range(num_transactions)],
    'Mining Pool Information': random.choices(["PoolA", "PoolB", "PoolC", "PoolD"], k=num_transactions)
})

# Generate account data: one row per account.
# The age in days is drawn once and used for BOTH 'Account Creation Date' and
# 'Account Age', so the two columns agree. Drawing them independently produced
# rows whose stated age contradicted their creation date.
# The clock is read once so every creation date is offset from the same instant.
# NOTE: 'Account Age' now spans 30-365 days (the creation-date range), and the
# seeded random stream after this point differs from the independent-draw version.
account_reference_time = datetime.now()
account_age_days = [random.randint(30, 365) for _ in range(num_accounts)]

account_data = pd.DataFrame({
    'Account ID': generate_account_id(num_accounts),
    'Account Creation Date': [account_reference_time - timedelta(days=age) for age in account_age_days],
    'Account Activity Level': [random.choice(["Low", "Medium", "High"]) for _ in range(num_accounts)],
    'Account Age': account_age_days,  # in days, consistent with 'Account Creation Date'
    'Account Balance': [round(random.uniform(0, 10000), 2) for _ in range(num_accounts)],
    'Account Reputation Score': [random.randint(0, 100) for _ in range(num_accounts)]
})

# Behavioural features: one row per account, each feature drawn independently at random.
# Columns are filled in the order they appear in the final frame, which also
# fixes the order in which random numbers are consumed.
behavioral_columns = {}
behavioral_columns['Account ID'] = generate_account_id(num_accounts)
behavioral_columns['Transaction Volume'] = [
    round(random.uniform(100, 10000), 2) for _row in range(num_accounts)
]
behavioral_columns['Transaction Frequency'] = [
    random.randint(1, 100) for _row in range(num_accounts)
]
behavioral_columns['Number of Distinct Counterparties'] = [
    random.randint(1, 50) for _row in range(num_accounts)
]
# Expressed in hours
behavioral_columns['Time Between Transactions'] = [
    random.uniform(1, 24) for _row in range(num_accounts)
]
behavioral_columns['Number of Transactions per Day'] = [
    random.randint(1, 10) for _row in range(num_accounts)
]
# 0: Inconsistent, 1: Consistent
behavioral_columns['Pattern Consistency'] = random.choices([0, 1], k=num_accounts)

behavioral_data = pd.DataFrame(behavioral_columns)

# Sybil-specific indicators: one row per account.
# The (name, values) pairs are evaluated top to bottom, so random numbers are
# consumed column by column in the listed order.
sybil_columns = [
    ('Account ID', generate_account_id(num_accounts)),
    ('Shared IP Address', [generate_ip_address() for _row in range(num_accounts)]),
    ('Device Information', [generate_device_info() for _row in range(num_accounts)]),
    # 1: Linked, 0: Not Linked
    ('Account Linking', random.choices([0, 1], k=num_accounts)),
    ('Geographical Location', [generate_geolocation() for _row in range(num_accounts)]),
    # Fingerprint suffixes are zero-based (Fingerprint_0 ...), unlike the one-based account IDs.
    ('Device Fingerprinting', [f"Fingerprint_{position}" for position in range(num_accounts)]),
]
sybil_data = pd.DataFrame(dict(sybil_columns))

# Merge all dataframes to simulate the final dataset.
# Each transaction is enriched with the account-level features of its SENDER
# (receiver-side features are not attached).
# validate='many_to_one' makes pandas raise if an 'Account ID' appears more than
# once in a lookup table, instead of silently duplicating transaction rows.
final_dataset = (
    transaction_data
    .merge(account_data, left_on='Sender Address', right_on='Account ID',
           how='left', validate='many_to_one')
    .merge(behavioral_data, on='Account ID', how='left', validate='many_to_one')
    .merge(sybil_data, on='Account ID', how='left', validate='many_to_one')
    # The join key duplicates 'Sender Address', so it is dropped once the merges are done.
    .drop(columns=['Account ID'])
)

# Left joins against unique keys must keep exactly one row per transaction.
assert len(final_dataset) == len(transaction_data), "merge changed the number of transactions"

# Display the first few rows of the final dataset (last expression -> rich notebook display)
final_dataset.head()

  Transaction ID Sender Address Receiver Address  Amount Transferred  \
0       TX000001      ACC000320        ACC000050               14.33   
1       TX000002      ACC000013        ACC000343               66.02   
2       TX000003      ACC000138        ACC000273               22.11   
3       TX000004      ACC000112        ACC000489               30.06   
4       TX000005      ACC000369        ACC000180                6.11   

                   Timestamp            Transaction Type  Fraudulent Label  \
0 2025-02-26 23:32:59.344721  Smart Contract Interaction                 1   
1 2025-02-26 21:02:59.344721  Smart Contract Interaction                 0   
2 2025-02-26 19:48:59.344721            Regular Transfer                 0   
3 2025-02-27 11:34:59.344721            Regular Transfer                 1   
4 2025-02-27 13:42:59.344721            Regular Transfer                 0   

   Suspicious Transaction Flags   Block Hash  Block Number  ...  \
0                             1

In [5]:
# Persist the dataset. index=False keeps the meaningless 0..n-1 row index out of
# the file; otherwise it is written as an unnamed first column and comes back as
# "Unnamed: 0" when the CSV is read again.
final_dataset.to_csv("generated_data.csv", index=False)