In [0]:
import numpy as np
import random

from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
from pyspark.sql import functions as sf
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
from pyspark.sql.window import Window


# Seed both global generators (stdlib `random` and NumPy) with the same value
# so that re-running the notebook reproduces the same synthetic data.
random.seed(42)
np.random.seed(42)

# ============================================================================
# COMPANY PROFILE
# ============================================================================

# Descriptive attributes of the fictional company.
COMPANY_NAME = "GlobalBuild Construction Inc."
INDUSTRY = "Construction & Engineering"
FISCAL_YEAR_END = "December 31"

# Revenue assumptions.
BASE_ANNUAL_REVENUE = 10 * 1_000_000_000  # $10B
YOY_GROWTH_RATE = 0.05  # 5% annual growth
NUM_MONTHS = 4 * 12  # 48 months

# Legal Entities, keyed by entity code. The `revenue_split` values sum to 1.0.
LEGAL_ENTITIES = {
    'GB-US': dict(
        name='GlobalBuild Americas Inc.',
        continent='Americas',
        currency='USD',
        revenue_split=0.60,
    ),
    'GB-EU': dict(
        name='GlobalBuild EMEA Ltd.',
        continent='EMEA',
        currency='EUR',
        revenue_split=0.40,
    ),
}

In [0]:
# Load the projects table; the contract generators defined further down
# iterate over its rows to build their contracts.
projects = spark.read.table('jsp_demo.fin.projects')
# Render the DataFrame in the cell output. NOTE(review): `.display()` is not a
# core PySpark DataFrame method — presumably supplied by the Databricks
# notebook runtime; confirm before running elsewhere.
projects.display()

# Creating Contract Data

The contract data is generated using the project codes as a baseline.

*`.collect()` is used for simplicity because the dataset is small.*

In [0]:
# ============================================================================
# 5. CONTRACTS - INBOUND (Customer Contracts)
# ============================================================================

def generate_inbound_contracts(projects_df):
    """Generate customer (inbound) contracts linked to projects.

    One 'Customer Contract' row is emitted per project. A seeded 30% draw then
    decides whether a 'Change Order' row, worth 5-15% of the project value,
    follows it.

    Args:
        projects_df: Spark DataFrame whose rows expose `project_id`,
            `project_name`, `legal_entity_code`, `project_value`,
            `start_date` (parsed as a 'YYYY-MM-DD' string) and
            `duration_months`. The frame is collected to the driver, so it
            should be small.
            NOTE(review): `project_value` presumably holds Python floats, since
            the output column is declared as DoubleType — confirm against the
            projects table.

    Returns:
        Spark DataFrame with one row per contract / change order, using the
        all-nullable schema declared at the end of this function. Dates are
        emitted as 'YYYY-MM-DD' strings.

    Notes:
        * Months are approximated as 30 days when computing end dates.
        * The global `random` module is re-seeded with `42 + contract_id`
          before each group of draws. This makes the output reproducible but
          overwrites whatever global RNG state the caller had.
    """
    payment_terms_options = ['Net 30', 'Net 45', 'Net 60', 'Milestone-based', 'Progress billing']
    # Master contracts starting before this date are 'Active', otherwise 'Pending'.
    status_cutoff = datetime(2024, 1, 1)

    contracts = []
    contract_id = 10000

    # Collect projects to iterate (for small datasets this is acceptable)
    for project in projects_df.collect():
        contract_value = project['project_value']
        start_date = datetime.strptime(project['start_date'], '%Y-%m-%d')
        duration = project['duration_months']
        total_days = duration * 30
        end_date = start_date + timedelta(days=total_days)

        random.seed(42 + contract_id)
        payment_terms = random.choice(payment_terms_options)
        status = 'Active' if start_date < status_cutoff else 'Pending'
        customer_name = f"Customer_{contract_id % 100}"

        contracts.append({
            'contract_id': f'CUST-{contract_id}',
            'contract_type': 'Inbound',
            'contract_category': 'Customer Contract',
            'project_id': project['project_id'],
            'customer_name': customer_name,
            'legal_entity_code': project['legal_entity_code'],
            'contract_value': contract_value,
            'contract_start_date': start_date.strftime('%Y-%m-%d'),
            'contract_end_date': end_date.strftime('%Y-%m-%d'),
            'payment_terms': payment_terms,
            'status': status,
            'contract_description': f"Master contract for {project['project_name']}"
        })
        contract_id += 1

        # 30% chance of change orders
        random.seed(42 + contract_id)
        if random.random() < 0.30:
            change_order_value = contract_value * (0.05 + random.random() * 0.10)

            # The change order lands between 90 days after the start and 90 days
            # before the end. For projects shorter than 6 months that window is
            # empty and random.randint would raise ValueError, so it collapses
            # to the project midpoint. Projects of 6+ months are unaffected.
            earliest_day = min(90, total_days // 2)
            latest_day = max(earliest_day, total_days - 90)
            change_order_days = random.randint(earliest_day, latest_day)
            change_order_date = start_date + timedelta(days=change_order_days)

            contracts.append({
                'contract_id': f'CUST-{contract_id}',
                'contract_type': 'Inbound',
                'contract_category': 'Change Order',
                'project_id': project['project_id'],
                # Same customer as the master contract emitted just above.
                'customer_name': customer_name,
                'legal_entity_code': project['legal_entity_code'],
                'contract_value': round(change_order_value, 2),
                'contract_start_date': change_order_date.strftime('%Y-%m-%d'),
                'contract_end_date': end_date.strftime('%Y-%m-%d'),
                'payment_terms': payment_terms,
                'status': 'Active',
                'contract_description': f"Change order for {project['project_name']}"
            })
            contract_id += 1

    schema = StructType([
        StructField('contract_id', StringType(), True),
        StructField('contract_type', StringType(), True),
        StructField('contract_category', StringType(), True),
        StructField('project_id', StringType(), True),
        StructField('customer_name', StringType(), True),
        StructField('legal_entity_code', StringType(), True),
        StructField('contract_value', DoubleType(), True),
        StructField('contract_start_date', StringType(), True),
        StructField('contract_end_date', StringType(), True),
        StructField('payment_terms', StringType(), True),
        StructField('status', StringType(), True),
        StructField('contract_description', StringType(), True)
    ])

    return spark.createDataFrame(contracts, schema)

# Read the projects table (the same table an earlier cell loads and displays).
projects = spark.read.table('jsp_demo.fin.projects')
# Build the customer contracts from the project rows.
inbound_contracts = generate_inbound_contracts(projects)
# Persist the result, creating the table or replacing it if it already exists.
inbound_contracts.writeTo("jsp_demo.fin.inbound_contracts").createOrReplace()


In [0]:
# ============================================================================
# 6. CONTRACTS - OUTBOUND (Vendor/Supplier Contracts)
# ============================================================================

def generate_outbound_contracts(projects_df):
    """Generate vendor/supplier (outbound) contracts linked to projects.

    Each project gets a seeded random number of vendor contracts: 8-15 when
    `project_size` is 'Large', otherwise 3-8. Every contract draws a vendor
    category, a specialty within that category, a value expressed as a
    fraction of the project value, a start offset and a duration.

    Args:
        projects_df: Spark DataFrame whose rows expose `project_id`,
            `project_name`, `legal_entity_code`, `project_value`,
            `project_size`, `start_date` (parsed as a 'YYYY-MM-DD' string)
            and `duration_months`. The frame is collected to the driver, so
            it should be small.

    Returns:
        Spark DataFrame with one row per vendor contract, using the
        all-nullable schema declared at the end of this function. Dates are
        emitted as 'YYYY-MM-DD' strings.

    Notes:
        * Months are approximated as 30 days.
        * The global `random` module is re-seeded with `42 + contract_id`
          for every contract. This makes the output reproducible but
          overwrites whatever global RNG state the caller had.
        * NOTE(review): each vendor's value is drawn independently and nothing
          caps the sum across a project's vendors, so total vendor spend can
          exceed the project value. Confirm that is acceptable for the demo.
    """
    vendor_types = {
        'Materials Supplier': ['Concrete', 'Steel', 'Lumber', 'Glass', 'Electrical', 'Plumbing'],
        'Subcontractor': ['Electrical', 'HVAC', 'Plumbing', 'Roofing', 'Foundation', 'Finishing'],
        'Equipment Rental': ['Cranes', 'Excavators', 'Loaders', 'Scaffolding', 'Tools']
    }
    # Loop-invariant: build the category list once rather than per vendor.
    vendor_categories = list(vendor_types.keys())
    payment_terms_options = ['Net 30', 'Net 45', 'Net 60']
    # Contracts starting before this date are 'Active', otherwise 'Pending'.
    status_cutoff = datetime(2024, 1, 1)

    contracts = []
    contract_id = 20000

    for project in projects_df.collect():
        project_value = project['project_value']
        start_date = datetime.strptime(project['start_date'], '%Y-%m-%d')
        duration = project['duration_months']

        random.seed(42 + contract_id)
        num_vendors = random.randint(8, 15) if project['project_size'] == 'Large' else random.randint(3, 8)

        for _ in range(num_vendors):
            # Seed from the contract id alone. contract_id already advances by
            # one per vendor, so also adding the loop index made seeds advance
            # by two and collide with seeds used by later projects, giving
            # different contracts identical draws.
            random.seed(42 + contract_id)
            vendor_category = random.choice(vendor_categories)
            vendor_specialty = random.choice(vendor_types[vendor_category])

            # Value as a share of the project: materials 15-30%,
            # subcontractors 20-40%, everything else (equipment rental) 3-8%.
            if vendor_category == 'Materials Supplier':
                contract_value = project_value * (0.15 + random.random() * 0.15)
            elif vendor_category == 'Subcontractor':
                contract_value = project_value * (0.20 + random.random() * 0.20)
            else:
                contract_value = project_value * (0.03 + random.random() * 0.05)

            # Vendor starts somewhere in the first quarter of the project
            # (at least a 30-day window).
            contract_start_days = random.randint(0, max(1, duration//4) * 30)
            contract_start = start_date + timedelta(days=contract_start_days)

            # Vendor runs for between half and the full project duration, in
            # months. The upper bound is clamped so a zero-month project does
            # not give randint an empty range; durations >= 1 are unaffected.
            min_months = max(1, duration//2)
            contract_duration = random.randint(min_months, max(min_months, duration))
            contract_end = contract_start + timedelta(days=contract_duration * 30)

            status = 'Active' if contract_start < status_cutoff else 'Pending'

            contracts.append({
                'contract_id': f'VEND-{contract_id}',
                'contract_type': 'Outbound',
                'contract_category': vendor_category,
                'project_id': project['project_id'],
                'vendor_name': f"{vendor_specialty}_{vendor_category}_{contract_id % 500}",
                'legal_entity_code': project['legal_entity_code'],
                'contract_value': round(contract_value, 2),
                'contract_start_date': contract_start.strftime('%Y-%m-%d'),
                'contract_end_date': contract_end.strftime('%Y-%m-%d'),
                'payment_terms': random.choice(payment_terms_options),
                'status': status,
                'contract_description': f"{vendor_specialty} services for {project['project_name']}"
            })
            contract_id += 1

    schema = StructType([
        StructField('contract_id', StringType(), True),
        StructField('contract_type', StringType(), True),
        StructField('contract_category', StringType(), True),
        StructField('project_id', StringType(), True),
        StructField('vendor_name', StringType(), True),
        StructField('legal_entity_code', StringType(), True),
        StructField('contract_value', DoubleType(), True),
        StructField('contract_start_date', StringType(), True),
        StructField('contract_end_date', StringType(), True),
        StructField('payment_terms', StringType(), True),
        StructField('status', StringType(), True),
        StructField('contract_description', StringType(), True)
    ])

    return spark.createDataFrame(contracts, schema)


# Usage
# Read the projects table (the same table earlier cells load).
projects = spark.read.table('jsp_demo.fin.projects')
# Build the vendor/supplier contracts from the project rows.
outbound_contracts = generate_outbound_contracts(projects)
# Persist the result, creating the table or replacing it if it already exists.
outbound_contracts.writeTo("jsp_demo.fin.outbound_contracts").createOrReplace()