In [0]:

from pyspark.sql.types import StructType, StructField, LongType, StringType
from faker import Faker
import random

# -------------------- KEY VARIABLES -------------------- #
# Catalog name; interpolated into the volume path below.
catalog = 'fin_demo'
# Base directory under /Volumes/<catalog>/fin/ for generated output.
directory = f"/Volumes/{catalog}/fin/data_gen_outputs"
# Destination of the COA hierarchy JSON written at the end of this script.
output_path = f"{directory}/coa_hierarchy"


In [0]:
"""
Generate synthetic data for the COA (Chart of Accounts) hierarchy table.
This script uses PySpark and Faker to create realistic accounting data.
"""

# Initialize Faker
# NOTE(review): `fake` is not referenced in the visible part of this file;
# confirm whether other cells rely on it before removing.
fake = Faker()
# Seed both Faker and the stdlib `random` module with a fixed value,
# presumably so repeated runs are reproducible.
Faker.seed(42)
random.seed(42)

# Define schema
# All five columns are declared non-nullable: the ID columns are longs and
# the name columns are strings. Column order matches the record dicts
# produced by generate_coa_data().
schema = StructType([
    StructField(column_name, column_type(), nullable=False)
    for column_name, column_type in (
        ("coa_id", LongType),
        ("cost_center_id", LongType),
        ("cost_center_name", StringType),
        ("legal_entity_id", LongType),
        ("legal_entity_name", StringType),
    )
])

# Common cost center names for accounting (from YAML enum).
# Order matters: generate_coa_data() assigns cost center IDs by position.
cost_centers = [
    "Executive",
    "Corporate",
    "Sales Department",
    "Marketing Department",
    "Engineering",
    "R&D",
    "Finance",
    "Manufacturing",
]

# Legal entity data (5 legal entities as per YAML instructions).
# Each entry is a dict with an integer "id" and a display "name".
legal_entities = [
    {"id": entity_id, "name": entity_name}
    for entity_id, entity_name in (
        (100001, "Acme Corporation"),
        (100002, "Global Industries Inc"),
        (100003, "TechVentures LLC"),
        (100004, "International Holdings Ltd"),
        (100005, "Enterprise Solutions Group"),
    )
]

def generate_coa_data(cost_center_names=None, entities=None, *,
                      first_cost_center_id=1000, first_coa_id=70000000):
    """
    Generate COA hierarchy records, one per (cost center, legal entity) pair.

    Records are emitted cost center by cost center; within each cost center
    the legal entities appear in their given order. ``coa_id`` values are
    consecutive, starting at ``first_coa_id``, in emission order.

    Args:
        cost_center_names: Iterable of cost center names. The n-th name gets
            cost center ID ``first_cost_center_id + n``. Defaults to the
            module-level ``cost_centers`` list.
        entities: Iterable of dicts with ``"id"`` and ``"name"`` keys.
            Defaults to the module-level ``legal_entities`` list.
        first_cost_center_id: ID assigned to the first cost center.
        first_coa_id: ``coa_id`` of the first record (default is an
            8-digit ID).

    Returns:
        A list of dicts with keys ``coa_id``, ``cost_center_id``,
        ``cost_center_name``, ``legal_entity_id`` and ``legal_entity_name``.
        Empty if either input is empty.
    """
    # Resolve defaults at call time so the module-level lists are only
    # required when the caller does not supply their own.
    if cost_center_names is None:
        cost_center_names = cost_centers
    if entities is None:
        entities = legal_entities

    # Materialize once: the entities are re-iterated for every cost center
    # and their count is needed to compute consecutive coa_id values.
    entities = list(entities)
    entities_per_center = len(entities)

    return [
        {
            "coa_id": first_coa_id + cc_idx * entities_per_center + le_idx,
            "cost_center_id": first_cost_center_id + cc_idx,
            "cost_center_name": cc_name,
            "legal_entity_id": entity["id"],
            "legal_entity_name": entity["name"],
        }
        for cc_idx, cc_name in enumerate(cost_center_names)
        for le_idx, entity in enumerate(entities)
    ]

# Generate data
print("Generating COA hierarchy data...")
data = generate_coa_data()

# Create DataFrame
# NOTE(review): `spark` is not defined in this file; presumably the notebook
# runtime provides the SparkSession — confirm.
df = spark.createDataFrame(data, schema=schema)

# Show sample
print("\nSample of generated data:")
df.show(10, truncate=False)

# Summary statistics: total rows plus distinct counts for each ID column.
# Each count() call below is a separate Spark action on `df`.
print(f"\nTotal records: {df.count()}")
print(f"Unique COA IDs: {df.select('coa_id').distinct().count()}")
print(f"Unique cost centers: {df.select('cost_center_id').distinct().count()}")
print(f"Unique legal entities: {df.select('legal_entity_id').distinct().count()}")

# Write to JSON
# coalesce(1) reduces the DataFrame to a single partition before writing;
# mode("overwrite") replaces any existing output at `output_path`.
print(f"\nWriting data to {output_path}...")
df.coalesce(1).write.mode("overwrite").json(output_path)

print("Data generation complete!")
