In [0]:
import random
from datetime import datetime, timedelta

import numpy as np
from dateutil.relativedelta import relativedelta
from pyspark.sql import Window
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, BooleanType



# ----------- Source DataFrames read by the generators --------------- #
projects = spark.read.table('jsp_demo.fin.projects')


# Legal entities keyed by entity code. Insertion order (US, then EU) is
# significant: callers turn the keys into a positional list.
LEGAL_ENTITIES = dict([
    (
        'GB-US',
        dict(name='GlobalBuild Americas Inc.', continent='Americas', currency='USD', revenue_split=0.60),
    ),
    (
        'GB-EU',
        dict(name='GlobalBuild EMEA Ltd.', continent='EMEA', currency='EUR', revenue_split=0.40),
    ),
])

In [0]:
def generate_parts_master():
    """Generate parts/materials master data using PySpark.

    One row is produced per (category, item) pair of a fixed catalogue.
    Numeric ids are assigned sequentially from 1000 in catalogue order, and
    every other attribute is derived from a hash of that id, so repeated runs
    yield the same values.

    Returns:
        DataFrame with columns ``part_id`` ('PART-<n>'), ``part_number``
        ('P<n>'), ``part_name``, ``part_category``, ``unit_of_measure``
        (one of the UOM options), ``unit_cost`` (10.00 .. 4999.50),
        ``reorder_point`` (50 .. 500) and ``is_active`` (always True).
    """

    part_categories = {
        'Concrete': ['Ready-Mix Concrete', 'Concrete Blocks', 'Cement Bags', 'Reinforcement Bars'],
        'Steel': ['Structural Steel Beams', 'Steel Plates', 'Rebar', 'Steel Columns'],
        'Lumber': ['2x4 Lumber', '2x6 Lumber', 'Plywood Sheets', 'Timber Beams'],
        'Electrical': ['Wiring', 'Circuit Breakers', 'Conduit', 'Junction Boxes'],
        'Plumbing': ['PVC Pipes', 'Copper Pipes', 'Valves', 'Fittings'],
        'Glass': ['Window Panels', 'Glass Doors', 'Tempered Glass', 'Glazing'],
        'Hardware': ['Bolts', 'Screws', 'Nails', 'Anchors'],
        'Equipment': ['Crane Parts', 'Excavator Parts', 'Loader Parts', 'Tools']
    }

    uom_options = ['EA', 'LB', 'FT', 'SQ FT', 'CU YD', 'ROLL']

    # Flatten the catalogue into (part_id_num, category, part_name) tuples,
    # numbering parts from 1000 in catalogue order.
    catalogue_pairs = (
        (category, item)
        for category, items in part_categories.items()
        for item in items
    )
    parts_data = [
        (part_id, category, item)
        for part_id, (category, item) in enumerate(catalogue_pairs, start=1000)
    ]

    # Create DataFrame from the list
    schema = StructType([
        StructField('part_id_num', IntegerType(), True),
        StructField('part_category', StringType(), True),
        StructField('part_name', StringType(), True)
    ])

    parts_df = spark.createDataFrame(parts_data, schema)

    # F.hash returns a signed 32-bit int and Spark's `%` keeps the sign of the
    # dividend, so every `hash % n` below is wrapped in F.abs to keep the
    # derived value inside its intended non-negative range. (|hash % n| < n,
    # so the abs cannot overflow.)
    result = parts_df.withColumn(
        'part_id',
        F.concat(F.lit('PART-'), F.col('part_id_num').cast('string'))
    ).withColumn(
        'part_number',
        F.concat(F.lit('P'), F.col('part_id_num').cast('string'))
    ).withColumn(
        'unit_of_measure',
        # element_at is 1-based, hence the +1.
        F.element_at(
            F.array([F.lit(uom) for uom in uom_options]),
            F.abs(F.hash('part_id_num') % len(uom_options)) + 1
        )
    ).withColumn(
        'unit_cost',
        # 10 + fraction-in-[0, 1) * 4990; without abs this could fall as low
        # as roughly -4979.
        F.round(
            10 + (F.abs(F.hash(F.concat(F.lit('cost'), 'part_id_num')) % 10000) / 10000.0 * 4990),
            2
        )
    ).withColumn(
        'reorder_point',
        # 50 .. 500 inclusive; without abs this could fall to -400.
        F.abs(F.hash(F.concat(F.lit('reorder'), 'part_id_num')) % 451) + 50
    ).withColumn(
        'is_active',
        F.lit(True)
    ).select(
        'part_id', 'part_number', 'part_name', 'part_category',
        'unit_of_measure', 'unit_cost', 'reorder_point', 'is_active'
    )

    return result

# Build the parts master DataFrame; later cells pass it to the inventory
# transaction generator.
parts_master = generate_parts_master()

# Show the generated rows for inspection.
# NOTE(review): `.display()` is presumably the Databricks DataFrame helper and
# is not part of open-source PySpark — confirm the target runtime.
parts_master.display()
# Persisting the parts master table is currently disabled.
# parts_master.writeTo('jsp_demo.fin.parts_master').createOrReplace()



In [0]:
# ============================================================================
# 11. INVENTORY TRANSACTIONS
# ============================================================================

def generate_inventory_transactions(parts_master_df, projects_df):
    """Generate inventory movements (receipts and issues) using PySpark distributed compute.

    2000 transactions are generated from ``spark.range``. Every attribute is
    derived from a salted hash of the transaction index, so the output is a
    pure function of the index and of the sampled parts/projects.

    Receipts carry a positive quantity (10 .. 500), no project, and a legal
    entity picked from ``LEGAL_ENTITIES``. Issues carry a negative quantity
    (-5 .. -200) and take project and legal entity from the assigned project.

    Args:
        parts_master_df: DataFrame with at least ``part_id``, ``part_number``,
            ``part_name`` and ``unit_cost``. Up to 20 rows are used.
        projects_df: DataFrame with at least ``status``, ``project_id``,
            ``project_name`` and ``legal_entity_code``. Up to 50 rows with
            ``status == 'Active'`` are used.

    Returns:
        DataFrame with columns ``transaction_id``, ``transaction_type``,
        ``transaction_date`` (string), ``part_id``, ``part_number``,
        ``part_name``, ``quantity``, ``unit_cost``, ``total_value``,
        ``project_id``, ``legal_entity_code`` and ``description``.
        If no parts are available the result is empty; if no active projects
        are available, issue rows have null project-derived fields.
    """

    def salted_bucket(salt, modulus):
        # Non-negative bucket in [0, modulus) from a salted hash of txn_idx.
        # F.hash returns a signed int and Spark's `%` keeps the dividend's
        # sign, so the raw remainder may be negative; F.abs folds it back
        # (|hash % modulus| < modulus, so abs cannot overflow).
        return F.abs(F.hash(F.concat(F.lit(salt), 'txn_idx')) % modulus)

    def with_dense_index(df, order_col, index_col):
        # Attach a consecutive 0-based index. monotonically_increasing_id()
        # is only unique, not consecutive, so it cannot be matched against a
        # bucket number reliably. The un-partitioned window moves all rows to
        # one partition, which is acceptable for the <= 50 rows used here.
        return df.withColumn(
            index_col,
            F.row_number().over(Window.orderBy(order_col)) - 1
        )

    # Sample parts and projects
    active_parts = parts_master_df.limit(20).select('part_id', 'part_number', 'part_name', 'unit_cost')
    active_projects = projects_df.filter(F.col('status') == 'Active').limit(50).select(
        'project_id', 'project_name', 'legal_entity_code'
    )

    legal_entities = list(LEGAL_ENTITIES.keys())

    # Generate 2000 transaction records using range
    num_transactions = 2000
    transaction_base = spark.range(num_transactions).select(
        F.col('id').cast('int').alias('txn_idx')
    ).withColumn(
        'transaction_id',
        F.concat(F.lit('INV-'), (F.col('txn_idx') + 700000).cast('string'))
    ).withColumn(
        'transaction_days',
        # Offsets 0 .. 1460 cover 2021-09-01 through 2025-08-31 (1461 days).
        F.abs(F.hash('txn_idx') % 1461)
    ).withColumn(
        'transaction_date',
        F.date_add(F.lit('2021-09-01'), F.col('transaction_days'))
    ).withColumn(
        'is_receipt',
        # Parity check is unaffected by the remainder's sign.
        (F.hash(F.concat(F.lit('type'), 'txn_idx')) % 2) == 0
    )

    # Assign exactly one part to every transaction via a hash bucket.
    parts_with_idx = with_dense_index(active_parts, 'part_id', 'part_idx')
    part_count = active_parts.count()

    transaction_with_parts = transaction_base.join(
        parts_with_idx,
        # max(..., 1) avoids a modulo-by-zero when there are no parts; the
        # inner join against an empty side is then simply empty.
        (salted_bucket('part', 1000) % F.lit(max(part_count, 1))) == F.col('part_idx'),
        'inner'
    )

    # Assign one project to every transaction (only used by issues).
    projects_with_idx = with_dense_index(active_projects, 'project_id', 'proj_idx')
    project_count = active_projects.count()

    transaction_with_projects = transaction_with_parts.join(
        projects_with_idx,
        (salted_bucket('proj', 1000) % F.lit(max(project_count, 1))) == F.col('proj_idx'),
        'left'
    )

    # Calculate quantities and create final records
    result = transaction_with_projects.withColumn(
        'transaction_type',
        F.when(F.col('is_receipt'), F.lit('Receipt')).otherwise(F.lit('Issue'))
    ).withColumn(
        'quantity_base',
        # Always positive: 10 .. 500 for receipts, 5 .. 200 for issues.
        F.when(F.col('is_receipt'), salted_bucket('qty_receipt', 491) + 10)
        .otherwise(salted_bucket('qty_issue', 196) + 5)
    ).withColumn(
        'quantity',
        # Receipts add stock, issues remove it.
        F.when(F.col('is_receipt'), F.col('quantity_base')).otherwise(-F.col('quantity_base'))
    ).withColumn(
        'total_value',
        F.round(F.col('quantity') * F.col('unit_cost'), 2)
    ).withColumn(
        'legal_entity_code',
        # Receipts pick an entity by hash (element_at is 1-based); issues
        # inherit the entity of the joined project.
        F.when(F.col('is_receipt'),
               F.element_at(
                   F.array([F.lit(le) for le in legal_entities]),
                   salted_bucket('le', len(legal_entities)) + 1
               ))
        .otherwise(F.col('legal_entity_code'))
    ).withColumn(
        'project_id_final',
        F.when(F.col('is_receipt'), F.lit(None).cast('string')).otherwise(F.col('project_id'))
    ).withColumn(
        'description',
        F.when(F.col('is_receipt'),
               F.concat(F.lit('Receipt of '), F.col('part_name')))
        .otherwise(
               F.concat(F.lit('Issue to '), F.col('project_name')))
    ).select(
        'transaction_id',
        'transaction_type',
        F.col('transaction_date').cast('string').alias('transaction_date'),
        'part_id',
        'part_number',
        'part_name',
        'quantity',
        'unit_cost',
        'total_value',
        F.col('project_id_final').alias('project_id'),
        'legal_entity_code',
        'description'
    )

    return result

    
# Generate the transactions from the parts master and the projects table,
# then create or replace the target table with the result.
inventory_transactions = generate_inventory_transactions(parts_master, projects)
inventory_transactions.writeTo('jsp_demo.fin.inventory_transactions').createOrReplace()