In [0]:

import random
from datetime import datetime, timedelta, timezone

from faker import Faker
from pyspark.sql import functions as F
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, LongType, StringType, DoubleType

# -------------------- KEY VARIABLES -------------------- #
# Catalog name; it becomes the first path segment under /Volumes.
catalog = "fin_demo"

# Volume folder holding the generated datasets, and the employees subfolder
# that this notebook writes to.
directory = f"/Volumes/{catalog}/fin/data_gen_outputs"
output_path = f"{directory}/employees"

# How many employee rows to generate.
num_records = 2_000

In [0]:
"""
Generate synthetic data for the employees table.
This script uses PySpark and Faker to create realistic employee data with linear growth from 2022.
"""

# Build the Faker generator, then seed Faker and the stdlib RNG with the same
# fixed value so the pseudo-random draws below repeat from run to run.
seed_value = 42
fake = Faker()
Faker.seed(seed_value)
random.seed(seed_value)


# Output schema for the employees table. Every column is declared
# non-nullable; hire_date is stored as a long (epoch seconds).
schema = StructType([
    StructField(field_name, field_type, nullable=False)
    for field_name, field_type in (
        ("employee_id", LongType()),
        ("employee_name", StringType()),
        ("salary", DoubleType()),
        ("cost_center_id", StringType()),
        ("hire_date", LongType()),
    )
])

# (low, high) salary bounds keyed by cost center name. Names that are missing
# from this table fall back to a default range at lookup time.
cost_center_salary_ranges = {
    center_name: (low, high)
    for center_name, low, high in (
        ("Executive", 150000, 500000),
        ("Corporate", 80000, 200000),
        ("Sales Department", 50000, 150000),
        ("Marketing Department", 55000, 140000),
        ("Engineering", 90000, 200000),
        ("R&D", 85000, 180000),
        ("Finance", 70000, 160000),
        ("Manufacturing", 40000, 100000),
    )
}

# Generate data
print(f"Generating {num_records} employee records with linear growth from 2022...")

# Read COA hierarchy data to get cost center IDs
coa_hierarchy_path = f"{directory}/coa_hierarchy"
print(f"Reading COA hierarchy data from {coa_hierarchy_path}...")
coa_df = spark.read.json(coa_hierarchy_path)

# Get unique cost centers as (id, name) pairs.
# distinct() alone returns rows in no guaranteed order, and employees are later
# assigned to cost centers by list position, so sort explicitly to make that
# assignment the same on every run.
cost_centers = (
    coa_df.select("cost_center_id", "cost_center_name")
    .distinct()
    .orderBy("cost_center_id", "cost_center_name")
    .collect()
)
cost_center_list = [(row["cost_center_id"], row["cost_center_name"]) for row in cost_centers]

print(f"Found {len(cost_center_list)} cost centers")

# Fail here with a clear message instead of letting the round-robin
# `i % len(cost_center_list)` raise a bare ZeroDivisionError later on.
if not cost_center_list:
    raise ValueError(
        f"No cost centers found in {coa_hierarchy_path}; cannot assign employees to cost centers"
    )

# Start and end dates for hire_date distribution (2022-01-01 to present).
# Both bounds are timezone-aware UTC: .timestamp() on a naive datetime is
# interpreted in the machine's local timezone, which would make the stored
# epoch values depend on where the notebook runs.
start_date = datetime(2022, 1, 1, tzinfo=timezone.utc)
end_date = datetime.now(timezone.utc)
total_days = (end_date - start_date).days

# Generate employee data, one dict per row, keyed by the schema's field names
data = []
used_names = set()
employee_id = 50000000  # Start with 8-digit ID
cost_center_count = len(cost_center_list)

for i in range(num_records):
    # random() ** 0.7 pulls the uniform draw toward 1.0, so later dates are
    # sampled more often than early ones.
    # NOTE(review): a hiring rate that grows exactly linearly over time would
    # need exponent 0.5; 0.7 is kept unchanged - confirm the intended curve.
    hire_progress = random.random() ** 0.7
    days_offset = int(hire_progress * total_days)
    hire_date = start_date + timedelta(days=days_offset)
    hire_timestamp = int(hire_date.timestamp())  # epoch seconds (UTC)

    # Generate unique employee name: fake.name() can repeat, so redraw until
    # the name has not been used yet.
    while True:
        employee_name = fake.name()
        if employee_name not in used_names:
            used_names.add(employee_name)
            break

    # Select cost center round-robin by list position, which spreads
    # employees evenly across all cost centers.
    cost_center_id, cost_center_name = cost_center_list[i % cost_center_count]

    # Generate salary based on cost center; names without a configured range
    # fall back to (50000, 100000).
    salary_range = cost_center_salary_ranges.get(cost_center_name, (50000, 100000))
    salary = round(random.uniform(salary_range[0], salary_range[1]), 2)

    data.append({
        "employee_id": employee_id,
        "employee_name": employee_name,
        "salary": salary,
        "cost_center_id": str(cost_center_id),
        "hire_date": hire_timestamp
    })

    employee_id += 1

# Materialize the generated rows as a Spark DataFrame with the declared schema
df = spark.createDataFrame(data, schema=schema)

# Preview the first rows without truncating long values
print("\nSample of generated data:")
df.show(10, truncate=False)

# Row count, followed by distinct counts for the ID, name and cost center columns
total_records = df.count()
print(f"\nTotal records: {total_records}")
for label, column_name in (
    ("employee IDs", "employee_id"),
    ("employee names", "employee_name"),
    ("cost centers", "cost_center_id"),
):
    distinct_total = df.select(column_name).distinct().count()
    print(f"Unique {label}: {distinct_total}")

# Show salary statistics by cost center
print("\nSalary statistics by cost center:")
# One explicit aggregate per statistic. A dict literal that repeats the
# "salary" key keeps only its last entry, so the dict form of agg() would
# compute just a single statistic for the column.
df.groupBy("cost_center_id").agg(
    F.avg("salary"),
    F.min("salary"),
    F.max("salary"),
).show()

# Persist as JSON, replacing any previous output at the same path.
# coalesce(1) collapses the frame to a single partition before the write.
print(f"\nWriting data to {output_path}...")
single_partition_df = df.coalesce(1)
single_partition_df.write.mode("overwrite").json(output_path)

print("Data generation complete!")
