In [0]:
import numpy as np
import random

from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
from pyspark.sql.window import Window

# ----------- Source DataFrames for the generator jobs --------------- #
# One DataFrame per catalog table; the targets on the left line up
# positionally with the table names on the right.
(
    income_statement,
    chart_of_accounts,
    cost_centers,
    projects,
    inbound_contracts,
    outbound_contracts,
) = (
    spark.read.table(table_name)
    for table_name in (
        'jsp_demo.fin.income_statement',
        'jsp_demo.fin.chart_of_accounts',
        'jsp_demo.fin.cost_centers',
        'jsp_demo.fin.projects',
        'jsp_demo.fin.inbound_contracts',
        'jsp_demo.fin.outbound_contracts',
    )
)

# Legal entities, keyed by legal entity code.
LEGAL_ENTITIES = {
    'GB-US': {
        'name': 'GlobalBuild Americas Inc.',
        'continent': 'Americas',
        'currency': 'USD',
        'revenue_split': 0.60,
    },
    'GB-EU': {
        'name': 'GlobalBuild EMEA Ltd.',
        'continent': 'EMEA',
        'currency': 'EUR',
        'revenue_split': 0.40,
    },
}

In [0]:
# ============================================================================
# 7. GL ENTRIES 
# ============================================================================

def generate_gl_entries(income_statement_df, chart_of_accounts_df, cost_centers_df,
                       projects_df, inbound_contracts_df, outbound_contracts_df):
    """Generate GL journal entries using PySpark distributed compute.

    For every row of ``income_statement_df`` three families of debit/credit
    pairs are produced:

    * Revenue Recognition: 5 entries per period, each ``revenue / 5``,
      debited to account '1200' and credited to one of the revenue accounts,
      attributed to one active project.
    * COGS Expense: 5 entries per period, each ``cost_of_revenue / 5``,
      debited to one of the COGS accounts and credited to account '2100',
      attributed to one active project.
    * Operating Expense: one entry per OPEX account, each
      ``total_operating_expenses / 10``, debited to that account and credited
      to account '2100', attributed to a legal entity from ``LEGAL_ENTITIES``.

    Projects, accounts and legal entities are chosen deterministically from
    ``F.hash`` of the period and entry key, not randomly.

    Args:
        income_statement_df: DataFrame with ``period``, ``revenue``,
            ``cost_of_revenue`` and ``total_operating_expenses`` columns.
            ``period`` is concatenated with '-01' to form ``entry_date``
            (presumably 'YYYY-MM' -- confirm against the table).
        chart_of_accounts_df: DataFrame with ``account_number`` and
            ``account_name``; used to look up account names.
        cost_centers_df: DataFrame with ``cost_center_id``; a single cost
            center (``limit(1)``) is attached to every entry.
        projects_df: DataFrame with ``project_id``, ``project_name``,
            ``legal_entity_code``, ``start_date`` and ``status``; only rows
            with status 'Active' are used.
        inbound_contracts_df: Currently unused; kept for interface
            compatibility.
        outbound_contracts_df: Currently unused; kept for interface
            compatibility.

    Returns:
        DataFrame with columns ``entry_id``, ``entry_date``, ``period``,
        ``entry_type``, ``account_number``, ``account_name``, ``debit``,
        ``credit``, ``legal_entity_code``, ``cost_center_id``, ``project_id``,
        ``contract_id`` (always null) and ``description``. Revenue and COGS
        entries are absent when there is no active project, and the result is
        empty when ``cost_centers_df`` is empty (cross join).
    """

    def _bucket(hash_col, modulus):
        # Spark's % keeps the sign of the dividend and F.hash() returns a
        # signed int, so fold the remainder into [0, modulus). For
        # non-negative hashes this equals the plain remainder.
        return ((hash_col % modulus) + modulus) % modulus

    def _pick(options, hash_col):
        # element_at is 1-based and raises on index 0, so the index must be
        # built from a non-negative bucket.
        return F.element_at(
            F.array([F.lit(option) for option in options]),
            _bucket(hash_col, len(options)).cast('int') + 1
        )

    def _journal_lines(entries, id_offset, account, is_debit, project_id, description):
        # One side (debit or credit) of each journal entry in `entries`.
        amount = F.col('amount')
        zero = F.lit(0.0)
        return entries.select(
            F.concat(
                F.lit('JE'),
                (F.monotonically_increasing_id() * 2 + id_offset).cast('string')
            ).alias('entry_id'),
            'entry_date', 'period', 'entry_type',
            account.alias('account_number'),
            (amount if is_debit else zero).alias('debit'),
            (zero if is_debit else amount).alias('credit'),
            'legal_entity_code',
            project_id.alias('project_id'),
            description.alias('description')
        )

    # Account number -> account name lookup
    acct_lookup = chart_of_accounts_df.select('account_number', 'account_name').distinct()

    # Active projects (start_date is selected but not used for filtering here)
    active_projects = projects_df.filter(F.col('status') == 'Active') \
        .select('project_id', 'project_name', 'legal_entity_code', 'start_date')

    # Count once: every .count() call launches a Spark job.
    project_count = active_projects.count()
    # Guard the modulus against zero; with no active projects the inner joins
    # below return no rows anyway.
    bucket_count = max(project_count, 1)

    # Consecutive 0-based index. monotonically_increasing_id() is unique but
    # not consecutive, so it cannot be matched against a hash bucket reliably.
    projects_with_idx = active_projects.withColumn(
        'proj_idx',
        (F.row_number().over(Window.orderBy('project_id')) - 1).cast('long')
    )

    legal_entities = list(LEGAL_ENTITIES.keys())
    entries_per_period = 5
    entry_seq_values = F.array([F.lit(i) for i in range(entries_per_period)])
    no_project = F.lit(None).cast('string')

    # ===== REVENUE ENTRIES =====
    revenue_accounts = ['4100', '4110', '4120', '4130']
    revenue_key = F.hash(F.concat('period', 'entry_seq'))

    revenue_base = income_statement_df.select('period', 'revenue') \
        .withColumn('entry_seq', F.explode(entry_seq_values)) \
        .withColumn('entry_type', F.lit('Revenue Recognition'))

    # Each (period, entry_seq) lands on exactly one project bucket
    revenue_entries = revenue_base.join(
        projects_with_idx,
        _bucket(_bucket(revenue_key, 1000).cast('long'), bucket_count) == F.col('proj_idx'),
        'inner'
    ).withColumn(
        'revenue_acct', _pick(revenue_accounts, revenue_key)
    ).withColumn(
        'amount', F.round(F.col('revenue') / entries_per_period, 2)
    ).withColumn('entry_date', F.concat('period', F.lit('-01')))

    revenue_description = F.concat(F.lit('Revenue for '), 'project_name')
    revenue_debit = _journal_lines(
        revenue_entries, 100000, F.lit('1200'), True,
        F.col('project_id'), revenue_description)
    revenue_credit = _journal_lines(
        revenue_entries, 100001, F.col('revenue_acct'), False,
        F.col('project_id'), revenue_description)

    # ===== COGS ENTRIES =====
    cogs_accounts = ['5100', '5200', '5300', '5400', '5500']
    # The 'cogs' salt makes the project assignment differ from revenue
    cogs_project_key = F.hash(F.concat('period', 'entry_seq', F.lit('cogs')))
    cogs_account_key = F.hash(F.concat('period', 'entry_seq'))

    cogs_base = income_statement_df.select('period', 'cost_of_revenue') \
        .withColumn('entry_seq', F.explode(entry_seq_values)) \
        .withColumn('entry_type', F.lit('COGS Expense'))

    cogs_entries = cogs_base.join(
        projects_with_idx,
        _bucket(_bucket(cogs_project_key, 1000).cast('long'), bucket_count) == F.col('proj_idx'),
        'inner'
    ).withColumn(
        'cogs_acct', _pick(cogs_accounts, cogs_account_key)
    ).withColumn(
        'amount', F.round(F.col('cost_of_revenue') / entries_per_period, 2)
    ).withColumn('entry_date', F.concat('period', F.lit('-01')))

    cogs_description = F.concat(F.lit('COGS for '), 'project_name')
    cogs_debit = _journal_lines(
        cogs_entries, 200000, F.col('cogs_acct'), True,
        F.col('project_id'), cogs_description)
    cogs_credit = _journal_lines(
        cogs_entries, 200001, F.lit('2100'), False,
        F.col('project_id'), cogs_description)

    # ===== OPEX ENTRIES =====
    opex_accounts = ['6100', '6110', '6200', '6300', '6400', '6500', '6600', '6700', '6800', '6900']

    # One entry per OPEX account and period, split evenly across the accounts
    opex_base = income_statement_df.select('period', 'total_operating_expenses') \
        .withColumn('account_number', F.explode(F.array([F.lit(acc) for acc in opex_accounts]))) \
        .withColumn('entry_type', F.lit('Operating Expense')) \
        .withColumn(
            'legal_entity_code',
            _pick(legal_entities, F.hash(F.concat('period', 'account_number')))
        ).withColumn('entry_date', F.concat('period', F.lit('-01'))) \
        .withColumn('amount', F.round(F.col('total_operating_expenses') / len(opex_accounts), 2))

    opex_description = F.lit('Monthly operating expense')
    opex_debit = _journal_lines(
        opex_base, 300000, F.col('account_number'), True,
        no_project, opex_description)
    opex_credit = _journal_lines(
        opex_base, 300001, F.lit('2100'), False,
        no_project, opex_description)

    # Union all entries (all six frames share the same column order)
    all_entries = revenue_debit.unionAll(revenue_credit) \
                               .unionAll(cogs_debit).unionAll(cogs_credit) \
                               .unionAll(opex_debit).unionAll(opex_credit)

    # Attach a cost center (simplified - every entry gets the first cost center)
    first_cc = cost_centers_df.select('cost_center_id').limit(1)
    all_entries = all_entries.crossJoin(first_cc)

    # Join with account names and fix the output column order
    result = all_entries.join(acct_lookup, 'account_number', 'left') \
        .withColumn('contract_id', F.lit(None).cast('string')) \
        .select(
            'entry_id', 'entry_date', 'period', 'entry_type',
            'account_number', 'account_name',
            'debit', 'credit', 'legal_entity_code', 'cost_center_id',
            'project_id', 'contract_id', 'description'
        )

    return result


# Build the GL journal entries from the source DataFrames loaded earlier.
gl_entries = generate_gl_entries(
    income_statement_df=income_statement,
    chart_of_accounts_df=chart_of_accounts,
    cost_centers_df=cost_centers,
    projects_df=projects,
    inbound_contracts_df=inbound_contracts,
    outbound_contracts_df=outbound_contracts,
)

# Persist the generated entries. The table must receive `gl_entries`; writing
# a source DataFrame here would replace the GL table with contract rows.
gl_entries.writeTo("jsp_demo.fin.gl_entries").createOrReplace()



In [0]:
# ============================================================================
# 8. REVENUE TRANSACTIONS
# ============================================================================

def generate_revenue_transactions(inbound_contracts_df, income_statement_df):
    """Generate detailed revenue transactions (invoices, payments) using PySpark distributed compute.

    Each active inbound contract is billed once per month, starting on
    ``contract_start_date``, for at most 60 months. Only invoices dated from
    2021-09-01 up to the earlier of ``contract_end_date`` and 2025-08-31 are
    kept. Every invoice is due 45 days after its date, and a matching payment
    is emitted 30 to 60 days after the invoice date (chosen from a hash of the
    contract id and month offset) when that payment date is on or before
    2025-08-31.

    Args:
        inbound_contracts_df: DataFrame with ``contract_id``, ``project_id``,
            ``customer_name``, ``legal_entity_code``, ``contract_value``,
            ``contract_start_date``, ``contract_end_date`` and ``status``;
            only rows with status 'Active' are used.
        income_statement_df: Currently unused; kept for interface
            compatibility.

    Returns:
        DataFrame of 'Invoice' and 'Payment' rows with columns
        ``transaction_id``, ``transaction_type``, ``transaction_date``,
        ``contract_id``, ``project_id``, ``customer_name``,
        ``legal_entity_code``, ``invoice_amount``, ``payment_amount``,
        ``due_date``, ``payment_date``, ``status`` and ``description``.
        Date columns are emitted as strings.
    """
    max_contract_months = 60
    window_start = F.lit('2021-09-01')
    window_end = F.lit('2025-08-31')

    def _non_negative_mod(hash_col, modulus):
        # Spark's % keeps the sign of the dividend and F.hash() returns a
        # signed int; fold the remainder into [0, modulus) so the offset
        # added afterwards is a true lower bound.
        return ((hash_col % modulus) + modulus) % modulus

    # Filter active contracts and derive the per-month billing amount.
    # NOTE(review): months_between can be fractional, so monthly_amount times
    # the number of invoices need not equal contract_value -- confirm whether
    # that is acceptable for this data set.
    contracts = inbound_contracts_df.filter(F.col('status') == 'Active').select(
        'contract_id', 'project_id', 'customer_name', 'legal_entity_code',
        'contract_value', 'contract_start_date', 'contract_end_date'
    ).withColumn(
        'months_in_contract',
        F.months_between(F.col('contract_end_date'), F.col('contract_start_date')) + 1
    ).withColumn(
        'monthly_amount',
        F.round(F.col('contract_value') / F.col('months_in_contract'), 2)
    )

    # Month offsets 0..59: one candidate invoice per offset, trimmed by the
    # date filter below.
    month_range = F.array([F.lit(i) for i in range(max_contract_months)])

    contracts_with_months = contracts.withColumn('month_offset', F.explode(month_range)) \
        .withColumn(
            'invoice_date',
            F.expr("add_months(contract_start_date, month_offset)")
        ).filter(
            (F.col('invoice_date') >= window_start) &
            (F.col('invoice_date') <= F.least(F.col('contract_end_date'), window_end))
        ).withColumn(
            'due_date',
            F.date_add(F.col('invoice_date'), 45)
        ).withColumn(
            # Deterministic 30..60 day delay between invoice and payment
            'payment_days',
            _non_negative_mod(
                F.hash(F.concat('contract_id', 'month_offset')), 31
            ).cast('int') + 30
        ).withColumn(
            'payment_date',
            F.date_add(F.col('invoice_date'), F.col('payment_days'))
        )

    # Create invoice transactions
    invoices = contracts_with_months.select(
        F.concat(F.lit('INV-'), (F.monotonically_increasing_id() + 500000).cast('string')).alias('transaction_id'),
        F.lit('Invoice').alias('transaction_type'),
        F.col('invoice_date').cast('string').alias('transaction_date'),
        'contract_id', 'project_id', 'customer_name', 'legal_entity_code',
        F.col('monthly_amount').alias('invoice_amount'),
        F.lit(0.0).alias('payment_amount'),
        F.col('due_date').cast('string').alias('due_date'),
        F.lit(None).cast('string').alias('payment_date'),
        F.lit('Invoiced').alias('status'),
        F.concat(F.lit('Progress billing - Month '), (F.col('month_offset') + 1).cast('string')).alias('description')
    )

    # Create payment transactions (only where payment_date <= 2025-08-31)
    payments = contracts_with_months.filter(
        F.col('payment_date') <= window_end
    ).select(
        F.concat(F.lit('PMT-'), (F.monotonically_increasing_id() + 500000).cast('string')).alias('transaction_id'),
        F.lit('Payment').alias('transaction_type'),
        F.col('payment_date').cast('string').alias('transaction_date'),
        'contract_id', 'project_id', 'customer_name', 'legal_entity_code',
        F.lit(0.0).alias('invoice_amount'),
        F.col('monthly_amount').alias('payment_amount'),
        F.lit(None).cast('string').alias('due_date'),
        F.col('payment_date').cast('string').alias('payment_date'),
        F.lit('Paid').alias('status'),
        F.concat(F.lit('Payment received - Month '), (F.col('month_offset') + 1).cast('string')).alias('description')
    )

    # Union invoices and payments (both frames share the same column order)
    result = invoices.unionAll(payments)

    return result


# Build the revenue transactions and persist them to the catalog.
revenue_transactions = generate_revenue_transactions(
    inbound_contracts_df=inbound_contracts,
    income_statement_df=income_statement,
)
revenue_transactions.writeTo('jsp_demo.fin.revenue_transactions').createOrReplace()



In [0]:
# ============================================================================
# 9. SPEND TRANSACTIONS
# ============================================================================

def generate_spend_transactions(outbound_contracts_df):
    """Generate detailed spend transactions (POs, invoices, payments) using PySpark distributed compute.

    For every active outbound contract this produces:

    * one 'Purchase Order' for the full ``contract_value`` when the contract
      starts between 2021-09-01 and 2025-08-31;
    * 1 to 4 'Vendor Invoice' rows (count derived from a hash of the contract
      id), spread evenly between contract start and end and kept only when
      dated between 2021-09-01 and 2025-08-31;
    * one 'Vendor Payment' per kept invoice, 30 to 45 days after the invoice
      date, when that payment date is on or before 2025-08-31.

    Args:
        outbound_contracts_df: DataFrame with ``contract_id``,
            ``project_id``, ``vendor_name``, ``contract_category``,
            ``legal_entity_code``, ``contract_value``,
            ``contract_start_date``, ``contract_end_date``,
            ``contract_description`` and ``status``; only rows with status
            'Active' are used.

    Returns:
        DataFrame with columns ``transaction_id``, ``transaction_type``,
        ``transaction_date``, ``contract_id``, ``project_id``,
        ``vendor_name``, ``vendor_category``, ``legal_entity_code``,
        ``po_amount``, ``invoice_amount``, ``payment_amount``, ``status`` and
        ``description``. ``transaction_date`` is emitted as a string.
    """
    max_invoices = 4
    window_start = F.lit('2021-09-01')
    window_end = F.lit('2025-08-31')

    def _non_negative_mod(hash_col, modulus):
        # Spark's % keeps the sign of the dividend and F.hash() returns a
        # signed int; fold the remainder into [0, modulus) so the offset
        # added afterwards is a true lower bound.
        return ((hash_col % modulus) + modulus) % modulus

    # Filter active contracts
    contracts = outbound_contracts_df.filter(F.col('status') == 'Active').select(
        'contract_id', 'project_id', 'vendor_name', 'contract_category',
        'legal_entity_code', 'contract_value', 'contract_start_date',
        'contract_end_date', 'contract_description'
    )

    # Generate Purchase Orders (one per contract starting inside the window)
    pos = contracts.filter(
        (F.col('contract_start_date') >= window_start) &
        (F.col('contract_start_date') <= window_end)
    ).select(
        F.concat(F.lit('PO-'), (F.monotonically_increasing_id() + 600000).cast('string')).alias('transaction_id'),
        F.lit('Purchase Order').alias('transaction_type'),
        F.col('contract_start_date').cast('string').alias('transaction_date'),
        'contract_id', 'project_id', 'vendor_name',
        F.col('contract_category').alias('vendor_category'),
        'legal_entity_code',
        F.round(F.col('contract_value'), 2).alias('po_amount'),
        F.lit(0.0).alias('invoice_amount'),
        F.lit(0.0).alias('payment_amount'),
        F.lit('Approved').alias('status'),
        F.concat(F.lit('PO for '), 'contract_description').alias('description')
    )

    # Calculate contract duration and number of invoices. The signed
    # remainder is in -2..2, so after the +2 shift the value is clamped to
    # the 1..4 range.
    contracts_with_invoices = contracts.withColumn(
        'days_between',
        F.datediff(F.col('contract_end_date'), F.col('contract_start_date'))
    ).withColumn(
        'num_invoices',
        F.greatest(F.lit(1), F.least(F.lit(max_invoices), ((F.hash('contract_id') % 3) + 2).cast('int')))
    ).withColumn(
        'invoice_amount',
        F.round(F.col('contract_value') / F.col('num_invoices'), 2)
    )

    # Candidate invoice indexes 0..3; indexes >= num_invoices are dropped below
    invoice_range = F.array([F.lit(i) for i in range(max_invoices)])

    invoices_base = contracts_with_invoices.withColumn('inv_idx', F.explode(invoice_range)) \
        .filter(F.col('inv_idx') < F.col('num_invoices')) \
        .withColumn(
            'inv_date',
            F.expr("date_add(contract_start_date, cast((days_between / num_invoices) * inv_idx as int))")
        ).filter(
            (F.col('inv_date') >= window_start) &
            (F.col('inv_date') <= window_end)
        ).withColumn(
            # Deterministic 30..45 day delay between invoice and payment
            'payment_days',
            _non_negative_mod(
                F.hash(F.concat('contract_id', 'inv_idx')), 16
            ).cast('int') + 30
        ).withColumn(
            'payment_date',
            F.date_add(F.col('inv_date'), F.col('payment_days'))
        )

    # Create vendor invoices
    invoices = invoices_base.select(
        F.concat(F.lit('VINV-'), (F.monotonically_increasing_id() + 700000).cast('string')).alias('transaction_id'),
        F.lit('Vendor Invoice').alias('transaction_type'),
        F.col('inv_date').cast('string').alias('transaction_date'),
        'contract_id', 'project_id', 'vendor_name',
        F.col('contract_category').alias('vendor_category'),
        'legal_entity_code',
        F.lit(0.0).alias('po_amount'),
        F.col('invoice_amount'),
        F.lit(0.0).alias('payment_amount'),
        F.lit('Received').alias('status'),
        F.concat(F.lit('Invoice from '), 'vendor_name').alias('description')
    )

    # Create vendor payments (only where payment_date <= 2025-08-31)
    payments = invoices_base.filter(
        F.col('payment_date') <= window_end
    ).select(
        F.concat(F.lit('VPMT-'), (F.monotonically_increasing_id() + 800000).cast('string')).alias('transaction_id'),
        F.lit('Vendor Payment').alias('transaction_type'),
        F.col('payment_date').cast('string').alias('transaction_date'),
        'contract_id', 'project_id', 'vendor_name',
        F.col('contract_category').alias('vendor_category'),
        'legal_entity_code',
        F.lit(0.0).alias('po_amount'),
        F.lit(0.0).alias('invoice_amount'),
        F.col('invoice_amount').alias('payment_amount'),
        F.lit('Paid').alias('status'),
        F.concat(F.lit('Payment to '), 'vendor_name').alias('description')
    )

    # Union all transaction types (all three frames share the same column order)
    result = pos.unionAll(invoices).unionAll(payments)

    return result


# Build the spend transactions and persist them to the catalog.
spend_transactions = generate_spend_transactions(
    outbound_contracts_df=outbound_contracts,
)
spend_transactions.writeTo('jsp_demo.fin.spend_transactions').createOrReplace()