In [0]:
%sql
-- Create the catalog and schema that hold this notebook's tables (jsp_demo.fin.*).
-- IF NOT EXISTS keeps the cell safe to re-run.
CREATE CATALOG IF NOT EXISTS jsp_demo;
CREATE SCHEMA IF NOT EXISTS jsp_demo.fin;

In [0]:
import numpy as np
import random

from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, BooleanType


# Reproducibility: pin the global state of both random number generators
# (the stdlib `random` module and NumPy's legacy global generator).
random.seed(42)
np.random.seed(42)

# ----------------------------------------------------------------------------
# Company profile
# ----------------------------------------------------------------------------

COMPANY_NAME = "GlobalBuild Construction Inc."
INDUSTRY = "Construction & Engineering"
FISCAL_YEAR_END = "December 31"

# Revenue model inputs
BASE_ANNUAL_REVENUE = 10_000_000_000  # $10B per year
YOY_GROWTH_RATE = 0.05                # 5% year-over-year growth
NUM_MONTHS = 48                       # 48 monthly periods

# Legal entities, keyed by entity code. `revenue_split` is each entity's
# share of total revenue; the two shares sum to 1.0.
LEGAL_ENTITIES = {
    'GB-US': {
        'name': 'GlobalBuild Americas Inc.',
        'continent': 'Americas',
        'currency': 'USD',
        'revenue_split': 0.60,
    },
    'GB-EU': {
        'name': 'GlobalBuild EMEA Ltd.',
        'continent': 'EMEA',
        'currency': 'EUR',
        'revenue_split': 0.40,
    },
}

In [0]:
# ============================================================================
# 1. GENERATE 48-MONTH INCOME STATEMENT
# ============================================================================

def generate_income_statement(start_date='2021-09-01', num_months=48):
    """Generate a monthly P&L for a construction company using PySpark.

    Revenue for month ``i`` is ``BASE_ANNUAL_REVENUE / 12`` compounded at
    ``YOY_GROWTH_RATE`` (pro-rated by ``i / 12``), scaled by a calendar-month
    seasonality factor and by a random factor in [0.97, 1.03). Every cost line
    is a randomly drawn percentage of that month's revenue.

    The random draws for month ``i`` come from a private generator seeded with
    ``42 + i``, so the output is reproducible and the process-wide ``random``
    state is left untouched.

    Args:
        start_date: First period as a ``'%Y-%m-%d'`` string. The day component
            does not matter: every period is snapped to the first of its month.
        num_months: Number of consecutive monthly periods to generate.

    Returns:
        A Spark DataFrame with one row per month: period labels, the P&L lines
        from revenue down to net income (rounded to 2 decimals) and the gross,
        EBITDA and net margin percentages.
    """
    start = datetime.strptime(start_date, '%Y-%m-%d')
    period_starts = [(start + relativedelta(months=i)).replace(day=1) for i in range(num_months)]

    # Revenue multiplier per calendar month (1 = January ... 12 = December).
    seasonality = {
        1: 0.85, 2: 0.88, 3: 0.95, 4: 1.05, 5: 1.10, 6: 1.12,
        7: 1.15, 8: 1.13, 9: 1.08, 10: 1.02, 11: 0.92, 12: 0.90
    }

    # Loop-invariant inputs.
    base_monthly = BASE_ANNUAL_REVENUE / 12
    tax_rate = 0.24

    data = []
    for i, date in enumerate(period_starts):
        year_progress = i / 12.0
        growth_factor = (1 + YOY_GROWTH_RATE) ** year_progress
        seasonal_factor = seasonality[date.month]

        # A local generator yields the same sequence as random.seed(42 + i)
        # followed by module-level calls, without reseeding the global RNG.
        # The order of the draws below must not change, or the data changes.
        rng = random.Random(42 + i)
        revenue = base_monthly * growth_factor * seasonal_factor * (0.97 + rng.random() * 0.06)

        cogs_rate = 0.76 + rng.random() * 0.03
        cogs = revenue * cogs_rate
        gross_profit = revenue - cogs

        sg_and_a = revenue * (0.10 + rng.random() * 0.02)
        sales_marketing = revenue * (0.03 + rng.random() * 0.01)
        rd_expense = revenue * (0.005 + rng.random() * 0.005)

        total_opex = sg_and_a + sales_marketing + rd_expense
        ebitda = gross_profit - total_opex

        depreciation = revenue * (0.015 + rng.random() * 0.01)
        ebit = ebitda - depreciation

        interest_expense = revenue * (0.008 + rng.random() * 0.004)
        ebt = ebit - interest_expense

        # No tax credit on a loss. The floor is 0.0 (not 0) so the value stays
        # a float: the tax_expense column is DoubleType and a Python int would
        # be rejected by Spark's schema verification.
        tax_expense = max(0.0, ebt * tax_rate)
        net_income = ebt - tax_expense

        data.append({
            'period': date.strftime('%Y-%m'),
            # FISCAL_YEAR_END is December 31, so fiscal year == calendar year.
            'fiscal_year': date.year,
            'fiscal_quarter': f"Q{(date.month-1)//3 + 1}",
            'fiscal_month': date.month,
            'revenue': round(revenue, 2),
            'cost_of_revenue': round(cogs, 2),
            'gross_profit': round(gross_profit, 2),
            'gross_margin_pct': round((gross_profit / revenue) * 100, 2),
            'sg_and_a': round(sg_and_a, 2),
            'sales_and_marketing': round(sales_marketing, 2),
            'research_and_development': round(rd_expense, 2),
            'total_operating_expenses': round(total_opex, 2),
            'ebitda': round(ebitda, 2),
            'ebitda_margin_pct': round((ebitda / revenue) * 100, 2),
            'depreciation_and_amortization': round(depreciation, 2),
            'ebit': round(ebit, 2),
            'interest_expense': round(interest_expense, 2),
            'ebt': round(ebt, 2),
            'tax_expense': round(tax_expense, 2),
            'net_income': round(net_income, 2),
            'net_margin_pct': round((net_income / revenue) * 100, 2)
        })

    schema = StructType([
        StructField('period', StringType(), True),
        StructField('fiscal_year', IntegerType(), True),
        StructField('fiscal_quarter', StringType(), True),
        StructField('fiscal_month', IntegerType(), True),
        StructField('revenue', DoubleType(), True),
        StructField('cost_of_revenue', DoubleType(), True),
        StructField('gross_profit', DoubleType(), True),
        StructField('gross_margin_pct', DoubleType(), True),
        StructField('sg_and_a', DoubleType(), True),
        StructField('sales_and_marketing', DoubleType(), True),
        StructField('research_and_development', DoubleType(), True),
        StructField('total_operating_expenses', DoubleType(), True),
        StructField('ebitda', DoubleType(), True),
        StructField('ebitda_margin_pct', DoubleType(), True),
        StructField('depreciation_and_amortization', DoubleType(), True),
        StructField('ebit', DoubleType(), True),
        StructField('interest_expense', DoubleType(), True),
        StructField('ebt', DoubleType(), True),
        StructField('tax_expense', DoubleType(), True),
        StructField('net_income', DoubleType(), True),
        StructField('net_margin_pct', DoubleType(), True)
    ])

    return spark.createDataFrame(data, schema)

# Build the income statement with the default start date and length, then
# persist it as jsp_demo.fin.income_statement, replacing the table if it exists.
income_statement = generate_income_statement()
income_statement.writeTo("jsp_demo.fin.income_statement").createOrReplace()



In [0]:
# ============================================================================
# 2. CHART OF ACCOUNTS
# ============================================================================

def generate_chart_of_accounts():
    """Build the Chart of Accounts dimension as a Spark DataFrame.

    Each account gets a sequential surrogate ``account_id`` starting at 1000,
    assigned in the order the accounts are listed below, together with its
    ledger number, name, type and category. Every account is flagged active.

    Returns:
        A Spark DataFrame with columns account_id, account_number,
        account_name, account_type, account_category and is_active.
    """
    # (category, [(account number, account name, account type), ...]);
    # list order determines the account_id sequence.
    ledger = [
        ('Revenue', [
            ('4100', 'Project Revenue - Commercial', 'Revenue'),
            ('4110', 'Project Revenue - Residential', 'Revenue'),
            ('4120', 'Project Revenue - Infrastructure', 'Revenue'),
            ('4130', 'Project Revenue - Industrial', 'Revenue'),
            ('4200', 'Change Order Revenue', 'Revenue'),
            ('4300', 'Service Revenue', 'Revenue'),
        ]),
        ('Cost of Revenue', [
            ('5100', 'Direct Labor', 'COGS'),
            ('5200', 'Materials and Supplies', 'COGS'),
            ('5300', 'Subcontractor Costs', 'COGS'),
            ('5400', 'Equipment Rental', 'COGS'),
            ('5500', 'Project Site Costs', 'COGS'),
        ]),
        ('Operating Expenses', [
            ('6100', 'Salaries and Wages - Admin', 'OPEX'),
            ('6110', 'Salaries and Wages - Management', 'OPEX'),
            ('6200', 'Employee Benefits', 'OPEX'),
            ('6300', 'Office Rent and Utilities', 'OPEX'),
            ('6400', 'Professional Services', 'OPEX'),
            ('6500', 'Insurance', 'OPEX'),
            ('6600', 'Marketing and Business Development', 'OPEX'),
            ('6700', 'Travel and Entertainment', 'OPEX'),
            ('6800', 'Technology and Software', 'OPEX'),
            ('6900', 'Office Supplies', 'OPEX'),
        ]),
        ('Assets', [
            ('1100', 'Cash and Cash Equivalents', 'Asset'),
            ('1200', 'Accounts Receivable', 'Asset'),
            ('1300', 'Inventory - Materials', 'Asset'),
            ('1400', 'Prepaid Expenses', 'Asset'),
            ('1500', 'Property Plant and Equipment', 'Asset'),
            ('1510', 'Accumulated Depreciation', 'Asset'),
            ('1600', 'Construction Equipment', 'Asset'),
            ('1610', 'Accumulated Depreciation - Equipment', 'Asset'),
        ]),
        ('Liabilities', [
            ('2100', 'Accounts Payable', 'Liability'),
            ('2200', 'Accrued Expenses', 'Liability'),
            ('2300', 'Deferred Revenue', 'Liability'),
            ('2400', 'Short-term Debt', 'Liability'),
            ('2500', 'Long-term Debt', 'Liability'),
        ]),
        ('Other', [
            ('7100', 'Depreciation Expense', 'Depreciation'),
            ('7200', 'Amortization Expense', 'Depreciation'),
            ('8100', 'Interest Expense', 'Interest'),
            ('8200', 'Interest Income', 'Interest'),
            ('9100', 'Income Tax Expense', 'Tax'),
        ]),
    ]

    # Flatten to one tuple per account, keeping the listing order.
    flat_entries = [
        (group, number, title, kind)
        for group, entries in ledger
        for number, title, kind in entries
    ]

    rows = [
        {
            'account_id': surrogate_id,
            'account_number': number,
            'account_name': title,
            'account_type': kind,
            'account_category': group,
            'is_active': True
        }
        for surrogate_id, (group, number, title, kind) in enumerate(flat_entries, start=1000)
    ]

    # Column name -> Spark type constructor; every column is nullable.
    column_types = [
        ('account_id', IntegerType),
        ('account_number', StringType),
        ('account_name', StringType),
        ('account_type', StringType),
        ('account_category', StringType),
        ('is_active', BooleanType),
    ]
    coa_schema = StructType([StructField(column, make_type(), True) for column, make_type in column_types])

    return spark.createDataFrame(rows, coa_schema)

# Build the chart of accounts and persist it as jsp_demo.fin.chart_of_accounts,
# replacing the table if it exists.
chart_of_accounts = generate_chart_of_accounts()
chart_of_accounts.writeTo("jsp_demo.fin.chart_of_accounts").createOrReplace()


In [0]:
# ============================================================================
# 3. COST CENTERS
# ============================================================================

def generate_cost_centers(num_centers=50):
    """Generate cost centers representing departments and divisions using PySpark.

    Cost centers are numbered from 1 and take their division/function from a
    fixed 50-slot mix (30 Operations, 10 Corporate, 7 Sales, 3 Support). When
    more than 50 are requested the mix repeats from its start, so the function
    always returns exactly ``num_centers`` rows.

    Args:
        num_centers: Number of cost centers to generate; must be >= 0.

    Returns:
        A Spark DataFrame with columns cost_center_id ('CC0001', ...),
        cost_center_name ('<function> <nn>'), division, function and
        is_active (always True).

    Raises:
        ValueError: If ``num_centers`` is negative.
    """
    if num_centers < 0:
        # A negative value used to slice the template list from the end and
        # silently return a truncated set.
        raise ValueError(f"num_centers must be non-negative, got {num_centers}")

    # (division, function, number of consecutive slots in the mix)
    function_mix = [
        ('Operations', 'Project Delivery', 15),
        ('Operations', 'Site Management', 10),
        ('Operations', 'Equipment Management', 5),
        ('Corporate', 'Finance & Accounting', 3),
        ('Corporate', 'Human Resources', 2),
        ('Corporate', 'Legal & Compliance', 2),
        ('Corporate', 'IT & Technology', 3),
        ('Sales', 'Business Development', 4),
        ('Sales', 'Estimating & Bidding', 3),
        ('Support', 'Procurement', 2),
        ('Support', 'Quality & Safety', 1),
    ]
    cost_center_types = [
        (division, function)
        for division, function, slots in function_mix
        for _ in range(slots)
    ]

    cost_centers = []
    for i in range(1, num_centers + 1):
        # Wrap around the mix so requests beyond its length are honored
        # instead of being capped.
        division, function = cost_center_types[(i - 1) % len(cost_center_types)]
        cost_centers.append({
            'cost_center_id': f'CC{i:04d}',
            'cost_center_name': f'{function} {i:02d}',
            'division': division,
            'function': function,
            'is_active': True
        })

    schema = StructType([
        StructField('cost_center_id', StringType(), True),
        StructField('cost_center_name', StringType(), True),
        StructField('division', StringType(), True),
        StructField('function', StringType(), True),
        StructField('is_active', BooleanType(), True)
    ])

    return spark.createDataFrame(cost_centers, schema)

# Generate 50 cost centers and persist them as jsp_demo.fin.cost_centers,
# replacing the table if it exists.
cost_centers = generate_cost_centers(50)
cost_centers.writeTo("jsp_demo.fin.cost_centers").createOrReplace()



In [0]:
# ============================================================================
# 4. PROJECTS
# ============================================================================

def generate_projects(num_projects=200):
    """Generate project master data with a large/small project mix using PySpark.

    One fifth of the projects (the lowest ids) are 'Large', valued between
    $150M and $500M and lasting 18-48 months. The rest are 'Small', valued
    between $5M and $50M and lasting 6-24 months. With the default of 200 this
    gives 40 large and 160 small projects. The large tier is meant to carry
    roughly 80% of the value; that share is not enforced here.

    Project ``i`` draws its attributes from a private generator seeded with
    ``42 + i``, so the output is reproducible and the process-wide ``random``
    state is left untouched.

    Args:
        num_projects: Total number of projects to generate; must be >= 0.

    Returns:
        A Spark DataFrame with columns project_id, project_name, project_type,
        project_size, region, legal_entity_code, project_value, start_date
        ('%Y-%m-%d' string), duration_months and status.

    Raises:
        ValueError: If ``num_projects`` is negative.
    """
    if num_projects < 0:
        raise ValueError(f"num_projects must be non-negative, got {num_projects}")

    project_types = [
        'Commercial Office Building', 'Residential High-Rise', 'Infrastructure - Highway',
        'Infrastructure - Bridge', 'Industrial Plant', 'Hospital', 'Educational Facility',
        'Mixed-Use Development', 'Warehouse & Logistics', 'Retail Center'
    ]

    regions = ['North America - East', 'North America - West', 'Europe - North', 'Europe - South',
               'Middle East', 'Asia Pacific']

    earliest_start = datetime(2021, 1, 1)
    # Projects starting on or after this date have not begun yet.
    planning_cutoff = datetime(2024, 1, 1)

    def build_project(index, size, min_value, value_spread, min_months, max_months):
        """Return the record for project number `index` in the given size tier."""
        # Same sequence as random.seed(42 + index) plus module-level calls,
        # without reseeding the global RNG. Keep the draw order stable.
        rng = random.Random(42 + index)
        project_value = min_value + rng.random() * value_spread
        duration_months = rng.randint(min_months, max_months)
        region = rng.choice(regions)
        start = earliest_start + timedelta(days=rng.randint(0, 365 * 3))
        # Draw the type once so project_name and project_type always agree.
        project_type = rng.choice(project_types)

        # NOTE(review): 'Middle East' maps to GB-US and 'Asia Pacific' to
        # GB-EU, although LEGAL_ENTITIES describes GB-EU as the EMEA entity.
        # Kept as-is; confirm the intended mapping.
        legal_entity = 'GB-US' if 'North America' in region or 'Middle East' in region else 'GB-EU'

        return {
            'project_id': f'PRJ{index:05d}',
            'project_name': f'{project_type} - Project {index:03d}',
            'project_type': project_type,
            'project_size': size,
            'region': region,
            'legal_entity_code': legal_entity,
            'project_value': round(project_value, 2),
            'start_date': start.strftime('%Y-%m-%d'),
            'duration_months': duration_months,
            'status': 'Active' if start < planning_cutoff else 'Planning',
        }

    # 20% of the projects are large; ids 1..large_count are large, the rest small.
    large_count = num_projects // 5

    projects = [
        build_project(i, 'Large', 150_000_000, 350_000_000, 18, 48)
        for i in range(1, large_count + 1)
    ]
    projects += [
        build_project(i, 'Small', 5_000_000, 45_000_000, 6, 24)
        for i in range(large_count + 1, num_projects + 1)
    ]

    schema = StructType([
        StructField('project_id', StringType(), True),
        StructField('project_name', StringType(), True),
        StructField('project_type', StringType(), True),
        StructField('project_size', StringType(), True),
        StructField('region', StringType(), True),
        StructField('legal_entity_code', StringType(), True),
        StructField('project_value', DoubleType(), True),
        StructField('start_date', StringType(), True),
        StructField('duration_months', IntegerType(), True),
        StructField('status', StringType(), True)
    ])

    return spark.createDataFrame(projects, schema)

# Generate 200 projects and persist them as jsp_demo.fin.projects,
# replacing the table if it exists.
projects = generate_projects(200)
projects.writeTo("jsp_demo.fin.projects").createOrReplace()
