In [0]:
from pyspark.sql.types import StructType, StructField, LongType, StringType, DoubleType
from pyspark.sql.functions import col, sum as _sum, year, from_unixtime
from faker import Faker
import random
from datetime import datetime, timedelta

# -------------------- KEY VARIABLES -------------------- #
# Unity Catalog volume that holds the reference inputs and generated outputs.
catalog = 'main'
directory = f"/Volumes/{catalog}/finance_lakehouse/data_gen_outputs"
output_path = f"{directory}/inbound_contracts"

# Year span covered by the data set: 2023 through the current calendar year,
# both ends inclusive (hence the +1).
start_year = 2023
end_year = datetime.now().year
years = (end_year - start_year) + 1

# Volume: 1000 contracts for every year in the span.
num_records = years * 1000

# Value: aim for 1 Billion dollars of contract value per year in aggregate.
annual_target = 1_000_000_000
total_target_value = years * annual_target

In [0]:
"""
Generate synthetic data for the inbound_contracts table.
This script uses PySpark and maintains referential integrity with suppliers and legal entities.
"""

# Faker instance plus fixed seeds for Faker's generator and the stdlib
# `random` module, so repeated runs draw the same pseudo-random sequence.
fake = Faker()
Faker.seed(42)
random.seed(42)

# Spark schema for inbound_contracts. Every column is required
# (nullable=False); the two *_date columns are stored as LongType values.
schema = StructType([
    StructField(field_name, field_type, nullable=False)
    for field_name, field_type in (
        ("contract_id", LongType()),
        ("contract_number", StringType()),
        ("supplier_id", LongType()),
        ("legal_entity_id", LongType()),
        ("contract_currency", StringType()),
        ("total_contract_value", DoubleType()),
        ("contract_start_date", LongType()),
        ("estimated_completion_date", LongType()),
        ("contract_status", StringType()),
        ("contract_type", StringType()),
        ("contract_description", StringType()),
        ("payment_terms", StringType()),
    )
])

# Allowed values for the enumerated string columns.
CONTRACT_STATUS = [
    "Active",
    "Pending",
    "Completed",
    "On-Hold",
]
CONTRACT_TYPE = [
    "Fixed-Price",
    "Time-and-Materials",
    "Cost-Plus",
]
PAYMENT_TERMS = [
    "Net-30",
    "Net-45",
    "Net-60",
]

def generate_contract_value_pareto(target_avg, pareto_alpha=1.5):
    """
    Draw one heavy-tailed contract value scaled around ``target_avg``.

    A sample from ``random.paretovariate(pareto_alpha)`` is divided by the
    distribution's theoretical mean and multiplied by ``target_avg``, then
    capped at ``100 * target_avg`` and rounded to 2 decimals. Most draws are
    therefore well below ``target_avg`` while a few are far above it.

    Note that the cap truncates the upper tail, so the realised average over
    many draws falls somewhat below ``target_avg``. The default alpha of 1.5
    gives a less extreme concentration than a strict 80/20 split.

    Args:
        target_avg: Target average contract value to maintain aggregate target
        pareto_alpha: Shape parameter for Pareto distribution (higher = less
            extreme). Must be greater than 1, because the Pareto mean used
            for scaling is only finite and positive in that range.

    Returns:
        The contract value as a float rounded to 2 decimal places.

    Raises:
        ValueError: If ``pareto_alpha`` is not greater than 1.
    """
    # alpha == 1 would divide by zero below and alpha < 1 would make the
    # "mean" negative (yielding negative contract values), so reject both.
    if pareto_alpha <= 1:
        raise ValueError(
            f"pareto_alpha must be greater than 1 to have a finite mean, got {pareto_alpha}"
        )

    # Raw Pareto sample; always >= 1.
    pareto_value = random.paretovariate(pareto_alpha)

    # The mean of paretovariate(alpha) is alpha/(alpha-1) for alpha > 1
    # (3.0 for the default alpha=1.5).
    expected_mean = pareto_alpha / (pareto_alpha - 1)

    # Normalise the sample to mean 1, then scale to the target average.
    value = (pareto_value / expected_mean) * target_avg

    # Cap at reasonable maximum (100x average)
    return round(min(value, target_avg * 100), 2)

def generate_contract_description(contract_type):
    """Pick a random description matching the given contract type.

    Args:
        contract_type: One of "Fixed-Price", "Time-and-Materials" or
            "Cost-Plus". Any other value falls back to a generic description.

    Returns:
        A description string chosen with ``random.choice``.
    """
    options_by_type = {
        "Fixed-Price": [
            "Procurement of materials and equipment",
            "Vendor services and support contract",
            "Supply and delivery of goods",
            "Facilities management services",
        ],
        "Time-and-Materials": [
            "Consulting and advisory services procurement",
            "IT and technical support services",
            "Professional services agreement",
            "Engineering services contract",
        ],
        "Cost-Plus": [
            "Research and development services",
            "Custom manufacturing procurement",
            "Complex project services with variable scope",
            "Long-term vendor services contract",
        ],
    }

    try:
        candidates = options_by_type[contract_type]
    except KeyError:
        # Unknown type: a single generic option (still drawn via random.choice).
        candidates = ["General procurement contract"]

    return random.choice(candidates)

# Generate data
# Average value each contract must have for the total to land on target.
target_avg_value = total_target_value / num_records

print(f"Generating {num_records} records for inbound contracts from {start_year} to {end_year}...")
print(f"Target total contract value: ${total_target_value:,.2f}")
print(f"Target average contract value: ${target_avg_value:,.2f}")

# Read reference data
print("\nReading reference data...")
suppliers_df = spark.read.json(f"{directory}/suppliers")
coa_hierarchy_df = spark.read.json(f"{directory}/coa_hierarchy")

# Collect reference IDs for sampling.
# Spark does not guarantee row order without an explicit sort, so the IDs are
# sorted here; otherwise the fixed random seed would not give reproducible
# output. Null IDs are dropped because the target columns are non-nullable.
supplier_ids = sorted(
    row.supplier_id
    for row in suppliers_df.select("supplier_id").collect()
    if row.supplier_id is not None
)
legal_entity_ids = sorted(
    row.legal_entity_id
    for row in coa_hierarchy_df.select("legal_entity_id").distinct().collect()
    if row.legal_entity_id is not None
)

print(f"Available supplier_ids: {len(supplier_ids)}")
print(f"Available legal_entity_ids: {len(legal_entity_ids)}")

# Fail fast with a clear message instead of an IndexError from random.choice
# deep inside the generation loop.
if not supplier_ids:
    raise ValueError(f"No supplier_id values found in {directory}/suppliers")
if not legal_entity_ids:
    raise ValueError(f"No legal_entity_id values found in {directory}/coa_hierarchy")

# Generate contract data
data = []
contract_id = 40000000  # Start with 8-digit ID
used_contract_numbers = set()

# Create weighted sampling for Pareto distribution on supplier_id
# A random 20% of suppliers is picked with 80% probability per contract.
random.shuffle(supplier_ids)
pareto_cutoff = int(len(supplier_ids) * 0.2)
high_frequency_suppliers = supplier_ids[:pareto_cutoff]
low_frequency_suppliers = supplier_ids[pareto_cutoff:]

# Contract distribution over years (starting from 2023).
# "Now" is captured once so every record is dated and classified against the
# same instant; the derived values below are loop-invariant.
# NOTE(review): start dates are drawn uniformly over the whole window, so the
# per-year counts/values are not exactly 1000 contracts / the annual target.
start_date_2023 = datetime(start_year, 1, 1)
end_date_current = datetime.now()
total_days = (end_date_current - start_date_2023).days
# NOTE(review): these are naive datetimes, so timestamp() interprets them in
# the driver's local timezone - confirm it matches the Spark session timezone
# used by from_unixtime in the yearly summary below.
current_date = int(end_date_current.timestamp())

for _ in range(num_records):
    # Generate unique contract number from the last 6 digits of the ID,
    # advancing the ID until the number has not been used yet.
    contract_number = f"INB{str(contract_id)[-6:]}"
    while contract_number in used_contract_numbers:
        contract_id += 1
        contract_number = f"INB{str(contract_id)[-6:]}"
    used_contract_numbers.add(contract_number)

    # Each contract linked to only 1 supplier (per YAML instruction)
    # Apply Pareto principle: 80% chance to pick from high-frequency suppliers
    # (the high-frequency list is empty when there are fewer than 5 suppliers).
    if random.random() < 0.8 and high_frequency_suppliers:
        supplier_id = random.choice(high_frequency_suppliers)
    else:
        supplier_id = random.choice(low_frequency_suppliers if low_frequency_suppliers else supplier_ids)

    # Select random legal entity
    legal_entity_id = random.choice(legal_entity_ids)

    # Contract type and related fields
    contract_type = random.choice(CONTRACT_TYPE)

    # Generate contract value using Pareto principle
    total_contract_value = generate_contract_value_pareto(target_avg_value)

    # Generate dates starting from 2023
    days_offset = random.randint(0, total_days)
    start_date = start_date_2023 + timedelta(days=days_offset)
    duration_days = random.randint(30, 730)  # 1 month to 2 years
    completion_date = start_date + timedelta(days=duration_days)

    # Dates are stored as epoch seconds.
    contract_start_date = int(start_date.timestamp())
    estimated_completion_date = int(completion_date.timestamp())

    # Determine status based on dates
    if current_date > estimated_completion_date:
        contract_status = "Completed"
    elif current_date < contract_start_date:
        # NOTE(review): start dates never exceed "now" with the offsets drawn
        # above, so this branch is not reached with the current generation.
        contract_status = "Pending"
    else:
        contract_status = random.choices(
            ["Active", "On-Hold"],
            weights=[95, 5]  # More active contracts
        )[0]

    # Payment terms
    payment_terms = random.choice(PAYMENT_TERMS)

    data.append({
        "contract_id": contract_id,
        "contract_number": contract_number,
        "supplier_id": supplier_id,
        "legal_entity_id": legal_entity_id,
        "contract_currency": "USD",
        "total_contract_value": total_contract_value,
        "contract_start_date": contract_start_date,
        "estimated_completion_date": estimated_completion_date,
        "contract_status": contract_status,
        "contract_type": contract_type,
        "contract_description": generate_contract_description(contract_type),
        "payment_terms": payment_terms
    })

    contract_id += 1

# Create DataFrame
df = spark.createDataFrame(data, schema=schema)

# Show sample
print("\nSample of generated data:")
df.show(10, truncate=False)

# Statistics
print(f"\nTotal records: {df.count()}")
print(f"Unique contract IDs: {df.select('contract_id').distinct().count()}")
print(f"Unique contract numbers: {df.select('contract_number').distinct().count()}")
print(f"Unique supplier IDs: {df.select('supplier_id').distinct().count()}")
print(f"Unique legal entity IDs: {df.select('legal_entity_id').distinct().count()}")

# Value statistics
print("\nContract value statistics:")
df.select("total_contract_value").summary("count", "min", "max", "mean", "stddev").show()

# Calculate actual total value and compare it with the target
actual_total = df.agg({"total_contract_value": "sum"}).collect()[0][0]
print(f"\nActual total contract value: ${actual_total:,.2f}")
print(f"Target total contract value: ${total_target_value:,.2f}")
print(f"Difference: ${actual_total - total_target_value:,.2f}")

# Annual contract amount by contract start date
print("\nAnnual contract value by start year:")
# `count` is not part of the top-of-file pyspark import, so it is imported here.
from pyspark.sql.functions import from_unixtime, year, sum as _sum, count
df_with_year = df.withColumn("start_year", year(from_unixtime(col("contract_start_date"))))
annual_summary = df_with_year.groupBy("start_year").agg(
    _sum("total_contract_value").alias("total_value"),
    count("contract_id").alias("contract_count")
).orderBy("start_year")
annual_summary.show()

# Status distribution
print("\nContract status distribution:")
df.groupBy("contract_status").count().show()

# Type distribution
print("\nContract type distribution:")
df.groupBy("contract_type").count().show()

# Write to JSON
# coalesce(1) puts all rows in a single partition before the write;
# mode("overwrite") replaces any previous output at this path.
print(f"\nWriting data to {output_path}...")
df.coalesce(1).write.mode("overwrite").json(output_path)

print("Data generation complete!")
