In [0]:
import numpy as np
import random

from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
from pyspark.sql.window import Window

# ----------- Source tables consumed by the jobs below --------------- #
# NOTE(review): `spark` is not defined in this file; presumably it is the
# session object injected by the notebook runtime -- confirm.
income_statement = spark.read.table('jsp_demo.fin.income_statement')
chart_of_accounts = spark.read.table('jsp_demo.fin.chart_of_accounts')
cost_centers = spark.read.table('jsp_demo.fin.cost_centers')
projects = spark.read.table('jsp_demo.fin.projects')

# Legal entities, keyed by entity code. Each entry carries the display name,
# continent, reporting currency and the entity's revenue split.
LEGAL_ENTITIES = {
    'GB-US': {
        'name': 'GlobalBuild Americas Inc.',
        'continent': 'Americas',
        'currency': 'USD',
        'revenue_split': 0.60,
    },
    'GB-EU': {
        'name': 'GlobalBuild EMEA Ltd.',
        'continent': 'EMEA',
        'currency': 'EUR',
        'revenue_split': 0.40,
    },
}

# ============================================================================
# 12. FP&A DATA (Budget, Forecast, Actuals)
# ============================================================================

def _fpa_scenario_slice(income_statement_df, accounts_df, indexed_cost_centers, *,
                        amount_col, account_count, cc_count, legal_entities,
                        scenario, cc_salt, le_salt, last_updated, variance=None):
    """Build the FP&A rows for one scenario and one income-statement measure.

    Args:
        income_statement_df: DataFrame providing 'period', 'fiscal_year' and
            the column named by ``amount_col``.
        accounts_df: Accounts of a single type, with 'account_number',
            'account_name' and 'account_id'.
        indexed_cost_centers: Cost centers with 'cost_center_id' and a dense,
            0-based 'cc_idx' column.
        amount_col: Name of the income-statement column to spread over accounts.
        account_count: Number of rows in ``accounts_df``; the measure is divided
            by it so every account receives an equal share.
        cc_count: Number of rows in ``indexed_cost_centers``.
        legal_entities: List of legal entity codes to pick from.
        scenario: Literal written to the 'scenario' column.
        cc_salt: Optional string appended to the hash key that selects the
            cost center (None/empty means no suffix).
        le_salt: String appended to the hash key that selects the legal entity.
        last_updated: Column expression written to 'last_updated'.
        variance: Optional ``(base, span, salt)``; when given, the amount is
            multiplied by ``base + u * span`` where ``u`` is a per-account
            value in [0, 0.999] derived from hashing account_id + salt.

    Returns:
        DataFrame with columns scenario, fiscal_year, period, account_number,
        account_name, cost_center_id, legal_entity_code, amount, version,
        last_updated.
    """
    key_cols = ['period', 'account_id']
    cc_key = F.concat(*key_cols, F.lit(cc_salt)) if cc_salt else F.concat(*key_cols)
    le_key = F.concat(*key_cols, F.lit(le_salt))

    # F.hash() is a signed 32-bit value and Spark's % keeps the sign of the
    # dividend, so the remainder is folded with abs() before bucketing.
    # A negative bucket would never equal a cc_idx and the inner join below
    # would silently drop the row.
    cc_bucket = F.abs(F.hash(cc_key) % 1000).cast('long') % F.lit(cc_count)

    # element_at() is 1-based, hence the +1.
    le_position = F.abs((F.hash(le_key) % len(legal_entities)).cast('int')) + 1

    amount = F.col(amount_col) / F.lit(account_count)
    if variance is not None:
        base, span, var_salt = variance
        # abs() keeps the factor inside [base, base + span) instead of letting
        # negative hashes push it below the base.
        unit = F.abs(F.hash(F.concat('account_id', F.lit(var_salt))) % 1000) / 1000.0
        amount = amount * (base + (unit * span))

    return (
        income_statement_df.select('period', 'fiscal_year', amount_col)
        .crossJoin(accounts_df)
        .join(indexed_cost_centers, cc_bucket == F.col('cc_idx'), 'inner')
        .withColumn(
            'legal_entity_code',
            F.element_at(F.array([F.lit(le) for le in legal_entities]), le_position),
        )
        .select(
            F.lit(scenario).alias('scenario'),
            'fiscal_year', 'period', 'account_number', 'account_name',
            'cost_center_id', 'legal_entity_code',
            F.round(amount, 2).alias('amount'),
            F.lit('V1').alias('version'),
            last_updated.alias('last_updated'),
        )
    )


def generate_fpa_data(income_statement_df, chart_of_accounts_df, cost_centers_df, projects_df):
    """Generate FP&A planning data with Budget, Forecast, and Actuals using PySpark distributed compute.

    For every income-statement period, the 'revenue', 'cost_of_revenue' and
    'total_operating_expenses' measures are split evenly across the chart of
    accounts entries whose 'account_type' is 'Revenue', 'COGS' and 'OPEX'
    respectively. Each (period, account) pair is assigned exactly one cost
    center and one legal entity, chosen deterministically by hashing.

    Scenario factors applied to the per-account amount:
        Actuals  - none.
        Budget   - revenue 1.05..1.10, COGS and OPEX 0.95..1.00.
        Forecast - revenue 1.00..1.05, COGS and OPEX 0.98..1.02.

    Args:
        income_statement_df: DataFrame with 'period', 'fiscal_year', 'revenue',
            'cost_of_revenue' and 'total_operating_expenses'.
        chart_of_accounts_df: DataFrame with 'account_type', 'account_number',
            'account_name' and 'account_id'.
        cost_centers_df: DataFrame with 'cost_center_id'.
        projects_df: Accepted for interface compatibility; not used.

    Returns:
        DataFrame with columns scenario, fiscal_year, period, account_number,
        account_name, cost_center_id, legal_entity_code, amount, version and
        last_updated: Actuals rows first, then Budget, then Forecast.

    Note:
        Calling this function runs Spark jobs immediately: four ``count()``
        actions are used to size the per-account split and the cost-center
        buckets.
    """
    account_cols = ('account_number', 'account_name', 'account_id')

    def accounts_of_type(account_type):
        return chart_of_accounts_df.filter(
            F.col('account_type') == account_type
        ).select(*account_cols)

    revenue_accounts = accounts_of_type('Revenue')
    cogs_accounts = accounts_of_type('COGS')
    opex_accounts = accounts_of_type('OPEX')

    cost_center_ids = cost_centers_df.select('cost_center_id')
    legal_entities = list(LEGAL_ENTITIES.keys())

    # Get counts for distribution
    revenue_count = revenue_accounts.count()
    cogs_count = cogs_accounts.count()
    opex_count = opex_accounts.count()
    cc_count = cost_center_ids.count()

    # The bucket join needs indices that are exactly 0..cc_count-1.
    # monotonically_increasing_id() only guarantees unique, increasing values
    # (not consecutive ones), so a row_number over a global ordering is used.
    # The un-partitioned window pulls the rows into one partition, which is
    # acceptable for a small dimension table.
    indexed_cost_centers = cost_center_ids.withColumn(
        'cc_idx',
        F.row_number().over(Window.orderBy('cost_center_id')) - 1,
    )

    # (income-statement column, accounts of that type, number of accounts)
    measures = [
        ('revenue', revenue_accounts, revenue_count),
        ('cost_of_revenue', cogs_accounts, cogs_count),
        ('total_operating_expenses', opex_accounts, opex_count),
    ]

    # One entry per scenario. 'variance' maps a measure to (base, span);
    # an empty mapping means the amount is used as-is.
    scenarios = [
        {
            'scenario': 'Actuals',
            'cc_salt': None,
            'le_salt': 'le',
            'var_salt': None,
            'variance': {},
            'last_updated': F.concat('period', F.lit('-15')),
        },
        {
            'scenario': 'Budget',
            'cc_salt': 'b',
            'le_salt': 'leb',
            'var_salt': 'bvar',
            'variance': {
                'revenue': (1.05, 0.05),
                'cost_of_revenue': (0.95, 0.05),
                'total_operating_expenses': (0.95, 0.05),
            },
            'last_updated': F.concat(F.col('fiscal_year').cast('string'), F.lit('-01-01')),
        },
        {
            'scenario': 'Forecast',
            'cc_salt': 'f',
            'le_salt': 'lef',
            'var_salt': 'fvar',
            'variance': {
                'revenue': (1.00, 0.05),
                'cost_of_revenue': (0.98, 0.04),
                'total_operating_expenses': (0.98, 0.04),
            },
            'last_updated': F.concat(F.substring('period', 1, 7), F.lit('-01')),
        },
    ]

    slices = []
    for spec in scenarios:
        for amount_col, accounts_df, account_count in measures:
            bounds = spec['variance'].get(amount_col)
            variance = None if bounds is None else (bounds[0], bounds[1], spec['var_salt'])
            slices.append(
                _fpa_scenario_slice(
                    income_statement_df,
                    accounts_df,
                    indexed_cost_centers,
                    amount_col=amount_col,
                    account_count=account_count,
                    cc_count=cc_count,
                    legal_entities=legal_entities,
                    scenario=spec['scenario'],
                    cc_salt=spec['cc_salt'],
                    le_salt=spec['le_salt'],
                    last_updated=spec['last_updated'],
                    variance=variance,
                )
            )

    # Union all scenarios (positional union; every slice has the same columns)
    result = slices[0]
    for part in slices[1:]:
        result = result.unionAll(part)

    return result

# Build the combined Actuals / Budget / Forecast dataset from the source
# tables loaded at the top of this cell.
fpa_data = generate_fpa_data(
    income_statement_df=income_statement,
    chart_of_accounts_df=chart_of_accounts,
    cost_centers_df=cost_centers,
    projects_df=projects,
)

# Persist the result, creating the table or replacing it if it already exists.
fpa_data.writeTo("jsp_demo.fin.fpa_data").createOrReplace()




