# 01 — Synthetic Data Generation

This notebook generates synthetic banking transaction data. Candidate approaches are CTGAN, Gaussian copulas, and rule-based simulation; the cells below use the `TransactionSimulator` from `src.data.simulate_data`.

**Goals:**
- Define schema (customer, transaction, fraud label)
- Train generative models or use rule-based patterns
- Produce raw → interim → processed datasets
- Validate distributions and fraud prevalence


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import sys
sys.path.append('../')

from src.data.simulate_data import TransactionSimulator, generate_synthetic_data

# Data-stage directories, expressed relative to the notebook's working directory.
DATA_ROOT = Path("..") / "data"
RAW_DIR = DATA_ROOT / "raw"
INTERIM_DIR = DATA_ROOT / "interim"
PROCESSED_DIR = DATA_ROOT / "processed"

# Make sure every stage directory exists; already-existing directories are left as they are.
for stage_dir in (RAW_DIR, INTERIM_DIR, PROCESSED_DIR):
    stage_dir.mkdir(parents=True, exist_ok=True)


## 1. Define Schema + Rules  

Describe the variables and how fraud mechanisms behave.


In [None]:
# Schema Definition
# - transaction_id: Unique transaction identifier
# - customer_id: Customer identifier
# - timestamp: Transaction timestamp
# - amount: Transaction amount (log-normal distribution)
# - transaction_type: purchase, transfer, withdrawal, deposit, payment
# - merchant_category: retail, groceries, restaurant, gas, online, utility, other
# - country: Transaction country
# - account_balance_before: Account balance before transaction
# - fraud: Binary fraud label (1 = fraud, 0 = legitimate; 0.5% fraud rate)

# Fraud Patterns:
# 1. Unusually large amounts (30% of fraud)
# 2. Rapid successive transactions - burst pattern (25% of fraud)
# 3. Unusual geographic locations (20% of fraud)
# 4. Unusual transaction types for customer (15% of fraud)
# 5. Off-hours transactions (2-5 AM) (10% of fraud)


## 2. Generate Base Transactions


In [None]:
# Simulator settings, gathered in one place so they are easy to adjust.
simulator_settings = {
    "n_customers": 10000,
    "n_transactions": 500000,
    "fraud_rate": 0.005,  # 0.5% fraud rate
    "random_state": 42,
}
simulator = TransactionSimulator(**simulator_settings)

# Build the base transaction table and report how many rows came back.
print("Generating base transactions...")
df_base = simulator.generate_base_transactions()
n_base = len(df_base)
print(f"Generated {n_base:,} base transactions")
print(f"\nSample transactions:")

# Kept as the last expression of the cell so the notebook renders the preview.
df_base.head()


## 3. Inject Fraud Patterns
Define realistic fraud signals (bursts, large amounts, abnormal locations, etc.)


In [None]:
# Run the simulator's fraud injection over the base transactions.
df = simulator.inject_fraud_patterns(df_base)

# Select the rows labelled as fraud once and reuse them for both breakdowns.
fraud_rows = df.loc[df['fraud'] == 1]
n_total = len(df)
n_fraud = df['fraud'].sum()
fraud_share = df['fraud'].mean()

print(f"\nFraud Statistics:")
print(f"Total transactions: {n_total:,}")
print(f"Fraudulent transactions: {n_fraud:,} ({fraud_share:.2%})")

# Per-category counts of fraudulent rows; the country list is capped at ten entries.
print(f"\nFraud by transaction type:")
print(fraud_rows['transaction_type'].value_counts())
print(f"\nFraud by country:")
print(fraud_rows['country'].value_counts().head(10))


## 4. Save Raw Dataset


In [None]:
# Write the labelled transactions to CSV, leaving out the DataFrame index.
output_path = RAW_DIR / "transactions_raw.csv"
df.to_csv(output_path, index=False)
n_saved = len(df)
print(f"Saved {n_saved:,} transactions to {output_path}")

# Summary header framed by a 60-character rule.
rule = "=" * 60
print("\n" + rule)
print("Dataset Summary")
print(rule)

# Headline figures: time span, distinct customers, share of fraud-labelled rows.
timestamps = df['timestamp']
print(f"Date range: {timestamps.min()} to {timestamps.max()}")
print(f"Unique customers: {df['customer_id'].nunique():,}")
print(f"Fraud rate: {df['fraud'].mean():.2%}")

# Amount distribution for all rows, then for the fraud-labelled rows only.
print(f"\nAmount statistics:")
print(df['amount'].describe())
print(f"\nFraud amount statistics:")
fraud_amounts = df.loc[df['fraud'] == 1, 'amount']
print(fraud_amounts.describe())
