In [0]:
from pyspark.sql.types import StructType, StructField, LongType, StringType
from faker import Faker
import random

# -------------------- KEY VARIABLES -------------------- #
catalog = 'main'
directory = f"/Volumes/{catalog}/finance_lakehouse/data_gen_outputs"
output_path = f"{directory}/third_parties"

num_records=1000

In [0]:
"""
Generate synthetic data for the third_parties table.
This script uses PySpark and Faker to create realistic third-party vendor/partner data.
"""

# Faker instance that supplies company names, surnames, and cities.
fake = Faker()

# Seed both random sources (Faker's generator and the stdlib `random`
# module) with fixed values so the generated data is reproducible.
Faker.seed(42)
random.seed(42)

# Output schema: every column is required (nullable=False).
schema = StructType(
    [
        StructField(column_name, column_type, nullable=False)
        for column_name, column_type in (
            ("third_party_id", LongType()),
            ("third_party_name", StringType()),
        )
    ]
)

# Kinds of third parties a finance department typically deals with.
# NOTE: keep the order stable -- `random.choice` selects by position, so
# reordering changes the names produced for a given seed.
third_party_types = [
    "Bank",
    "Insurance",
    "Vendor",
    "Supplier",
    "Consulting",
    "Auditing",
    "Legal Services",
    "Tax Services",
    "Payroll Services",
    "Investment",
    "Leasing",
    "Credit",
    "Benefits Provider",
]

# Trailing words appended to generated company names.
# NOTE: keep the order stable -- `random.choice` selects by position, so
# reordering changes the names produced for a given seed.
company_suffixes = [
    "Inc",
    "LLC",
    "Corp",
    "Corporation",
    "Group",
    "Partners",
    "Associates",
    "Solutions",
    "Services",
    "Financial",
    "Advisors",
    "Consulting",
    "Company",
    "Co",
]

def generate_third_party_name():
    """
    Generate a realistic third-party company name.

    One of six naming patterns is picked at random and evaluated. The
    patterns combine a Faker-generated word (first word of a company name,
    a surname, or the first word of a city), or a fixed "American"/"Global"
    prefix, with entries from ``third_party_types`` and ``company_suffixes``.

    Relies on the module-level ``fake``, ``third_party_types`` and
    ``company_suffixes`` objects.

    Returns:
        str: The generated company name.
    """
    # Each pattern is a zero-argument callable so that only the chosen one
    # runs and consumes random draws.
    patterns = [
        # Faker company names can look like "Smith, Jones and Brown"; taking
        # the first word would leave a dangling comma ("Smith, Bank Inc"),
        # so trailing commas are stripped from that word.
        lambda: f"{fake.company().split()[0].rstrip(',')} {random.choice(third_party_types)} {random.choice(company_suffixes)}",
        lambda: f"{fake.last_name()} & {fake.last_name()} {random.choice(company_suffixes)}",
        lambda: f"{fake.city().split()[0]} {random.choice(third_party_types)} {random.choice(company_suffixes)}",
        lambda: f"American {random.choice(third_party_types)} {random.choice(company_suffixes)}",
        lambda: f"Global {random.choice(third_party_types)} {random.choice(company_suffixes)}",
        lambda: f"{fake.last_name()} {random.choice(third_party_types)}",
    ]

    return random.choice(patterns)()

def generate_third_parties_data(record_count=None):
    """
    Generate third parties data with unique IDs and names.

    Names come from ``generate_third_party_name()``; any name that has
    already been used is discarded and a new one is drawn. IDs are
    sequential integers starting at 10000000.

    Args:
        record_count: Number of records to generate. When omitted, the
            module-level ``num_records`` is used.

    Returns:
        list[dict]: One dict per record with the keys ``"third_party_id"``
        (int) and ``"third_party_name"`` (str). Empty when the requested
        count is zero or negative.

    Raises:
        RuntimeError: If the name generator yields only already-used names
            for many draws in a row, which indicates that the requested
            number of unique names cannot realistically be produced.
    """
    if record_count is None:
        record_count = num_records

    # Without this limit the loop would spin forever once the pool of
    # distinct names is exhausted.
    max_consecutive_duplicates = 10_000

    data = []
    used_names = set()
    consecutive_duplicates = 0

    third_party_id = 10000000  # Start with 8-digit ID

    while len(data) < record_count:
        third_party_name = generate_third_party_name()

        # Skip names that were already issued.
        if third_party_name in used_names:
            consecutive_duplicates += 1
            if consecutive_duplicates >= max_consecutive_duplicates:
                raise RuntimeError(
                    f"Unable to generate {record_count} unique third party names: "
                    f"only {len(data)} found before "
                    f"{max_consecutive_duplicates} consecutive duplicates"
                )
            continue

        consecutive_duplicates = 0
        used_names.add(third_party_name)

        data.append({
            "third_party_id": third_party_id,
            "third_party_name": third_party_name
        })

        third_party_id += 1

    return data

# Build the records in driver memory.
print(f"Generating {num_records} records for third parties...")
data = generate_third_parties_data()

# Turn the list of dicts into a Spark DataFrame using the explicit schema.
df = spark.createDataFrame(data, schema=schema)

# Preview the first rows without truncating long names.
print("\nSample of generated data:")
df.show(10, truncate=False)

# Sanity checks: total row count and distinct counts per column.
total_records = df.count()
print(f"\nTotal records: {total_records}")

unique_id_count = df.select('third_party_id').distinct().count()
print(f"Unique third party IDs: {unique_id_count}")

unique_name_count = df.select('third_party_name').distinct().count()
print(f"Unique third party names: {unique_name_count}")

# Persist as JSON; coalesce(1) reduces the data to a single partition
# before writing, and "overwrite" replaces any existing output.
print(f"\nWriting data to {output_path}...")
json_writer = df.coalesce(1).write.mode("overwrite")
json_writer.json(output_path)

print("Data generation complete!")
