In [0]:
# Put this notebook's own folder on the module search path so that code stored
# next to it (presumably the `workspaceSetup` package imported below) can be
# imported.
import sys

# Path of the running notebook as reported by the Databricks notebook context.
notebook = dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()

# Parent folder = everything before the last "/". rpartition yields "" when
# the path has no "/", instead of silently chopping off the last character the
# way slicing with rfind() == -1 would.
working_dir = notebook.rpartition("/")[0]

# Guard the append so re-running this cell does not stack duplicate entries.
if working_dir not in sys.path:
    sys.path.append(working_dir)

# NOTE(review): notebookPath() may be reported without the "/Workspace"
# filesystem prefix; confirm the appended folder is actually importable on the
# target runtime (prefix it with "/Workspace" if it is not).


In [0]:
from pyspark.sql import SparkSession
import time
from workspaceSetup import create_config, run_pipeline, StageResult
from workspaceSetup.config import discover_state_directories, print_config_summary, setup_catalog_and_schemas
from workspaceSetup.bronze_ingestion import run_bronze_ingestion, BRONZE_SCHEMAS
from workspaceSetup.silver_transformations import run_all_silver_transforms
from workspaceSetup.gold_dimensions import run_gold_builds_in_order


In [0]:

# Settings for this run: where the raw files are read from, and which
# catalog/schemas the three stages use.
config = create_config(
    # --- source ---
    volume_source="/Volumes/hls_bronze/hls_bronze/raw/syntheticData",
    source_system="SYNTHEA",
    # --- targets ---
    catalog="healthcare_dev",
    bronze_schema="bronze",
    silver_schema="silver",
    gold_schema="gold",
)

In [0]:
# Look up the state directories under the configured source volume, print the
# configuration summary (including how many states were found), then run the
# catalog/schema setup helper and keep its return value in `setup_results`.
states = discover_state_directories(dbutils, config.volume_source)
print_config_summary(config, len(states))
setup_results = setup_catalog_and_schemas(spark, config)


PIPELINE CONFIGURATION SUMMARY
Catalog:        healthcare_dev
Bronze Schema:  healthcare_dev.bronze
Silver Schema:  healthcare_dev.silver
Gold Schema:    healthcare_dev.gold
Volume Source:  /Volumes/hls_bronze/hls_bronze/raw/syntheticData
Source System:  SYNTHEA
States Found:   16
Catalog and schemas ready: healthcare_dev.{bronze, silver, gold}


In [0]:

# ---------------------------------------------------------------------------
# Bronze stage: run the raw CSV ingestion and condense the per-table results
# into one StageResult (`bronze_results`) for the final summary cell.
# ---------------------------------------------------------------------------
print(f"\n{'=' * 60}")
print("BRONZE STAGE: Raw CSV Ingestion")
print("=" * 60)

start_time = time.time()

# include_text=True is forwarded unchanged; see workspaceSetup.bronze_ingestion
# for what it enables.
results = run_bronze_ingestion(spark, config, include_text=True)

# Partition the per-table outcomes by their `success` flag.
successful = [table for table in results if table.success]
failed = [table for table in results if not table.success]

# Only tables that succeeded contribute to the row total.
total_rows = sum(table.row_count for table in successful)
elapsed = time.time() - start_time

# One "<table_name>: <error>" entry per failed table.
errors = [f"{table.table_name}: {table.error}" for table in failed]

bronze_results = StageResult(
    stage_name="bronze",
    tables_processed=len(results),
    tables_successful=len(successful),
    total_rows=total_rows,
    elapsed_seconds=round(elapsed, 2),
    success=not failed,  # the stage passes only when no table failed
    errors=errors,
)


BRONZE STAGE: Raw CSV Ingestion
Ingesting CSV tables...
  Ingesting patients... OK - 1,809 rows in 4.88s
  Ingesting encounters... OK - 168,069 rows in 5.69s
  Ingesting conditions... OK - 99,734 rows in 4.61s
  Ingesting observations... OK - 1,710,624 rows in 11.9s
  Ingesting procedures... OK - 491,847 rows in 6.38s
  Ingesting medications... OK - 124,882 rows in 5.03s
  Ingesting immunizations... OK - 47,504 rows in 4.43s
  Ingesting allergies... OK - 1,714 rows in 3.83s
  Ingesting careplans... OK - 8,433 rows in 3.97s
  Ingesting devices... OK - 15,682 rows in 4.28s
  Ingesting imaging_studies... OK - 146,526 rows in 4.65s
  Ingesting supplies... OK - 72,550 rows in 3.98s
  Ingesting organizations... OK - 5,061 rows in 4.14s
  Ingesting providers... OK - 5,061 rows in 3.9s
  Ingesting payers... OK - 160 rows in 3.75s
  Ingesting claims... OK - 281,537 rows in 8.05s
  Ingesting claims_transactions... OK - 1,112,613 rows in 11.68s
  Ingesting payer_transitions... OK - 65,196 rows i

In [0]:
    # -----------------------------------------------------------------------
    # Silver stage: run every silver transformation and condense the
    # per-table results into one StageResult (`silver_results`) for the final
    # summary cell.
    #
    # NOTE(review): unlike the other cells, this whole cell is indented by
    # four spaces. It presumably runs only because the notebook front end
    # tolerates uniform leading indentation; dedent it before exporting the
    # notebook as a plain script.
    # -----------------------------------------------------------------------
    print(f"\n{'=' * 60}")
    print("SILVER STAGE: FHIR Transformations")
    print("=" * 60)

    start_time = time.time()

    results = run_all_silver_transforms(spark, config)

    # Partition the per-table outcomes by their `success` flag.
    successful = [table for table in results if table.success]
    failed = [table for table in results if not table.success]

    # Only tables that succeeded contribute to the row total.
    total_rows = sum(table.row_count for table in successful)
    elapsed = time.time() - start_time

    # One "<table_name>: <error>" entry per failed table.
    errors = [f"{table.table_name}: {table.error}" for table in failed]

    silver_results = StageResult(
        stage_name="silver",
        tables_processed=len(results),
        tables_successful=len(successful),
        total_rows=total_rows,
        elapsed_seconds=round(elapsed, 2),
        success=not failed,  # the stage passes only when no table failed
        errors=errors,
    )


SILVER STAGE: FHIR Transformations
Running silver transformations for 18 tables...
  Created healthcare_dev.silver.v_patient_flattened with 1,809 rows in 3.84s
  Created healthcare_dev.silver.v_encounter_flattened with 168,069 rows in 3.79s
  Created healthcare_dev.silver.v_condition_flattened with 99,734 rows in 2.96s
  Created healthcare_dev.silver.v_observation_flattened with 1,710,624 rows in 7.94s
  Created healthcare_dev.silver.v_procedure_flattened with 491,847 rows in 4.63s
  Created healthcare_dev.silver.v_medication_flattened with 124,882 rows in 2.71s
  Created healthcare_dev.silver.v_organization_flattened with 5,061 rows in 2.86s
  Created healthcare_dev.silver.v_practitioner_flattened with 5,061 rows in 3.23s
  Created healthcare_dev.silver.v_payer_flattened with 160 rows in 2.21s
  Created healthcare_dev.silver.v_allergy_flattened with 1,714 rows in 2.16s
  Created healthcare_dev.silver.v_careplan_flattened with 8,433 rows in 2.00s
  Created healthcare_dev.silver.v_devi

In [0]:

# ---------------------------------------------------------------------------
# Gold stage: run the gold builds (the helper is responsible for ordering)
# and condense the per-table results into one StageResult (`gold_results`)
# for the final summary cell.
# ---------------------------------------------------------------------------
print(f"\n{'=' * 60}")
print("GOLD STAGE: Dimensional Models")
print("=" * 60)

start_time = time.time()

results = run_gold_builds_in_order(spark, config)

# Partition the per-table outcomes by their `success` flag.
successful = [table for table in results if table.success]
failed = [table for table in results if not table.success]

# Only tables that succeeded contribute to the row total.
total_rows = sum(table.row_count for table in successful)
elapsed = time.time() - start_time

# One "<table_name>: <error>" entry per failed table.
errors = [f"{table.table_name}: {table.error}" for table in failed]

gold_results = StageResult(
    stage_name="gold",
    tables_processed=len(results),
    tables_successful=len(successful),
    total_rows=total_rows,
    elapsed_seconds=round(elapsed, 2),
    success=not failed,  # the stage passes only when no table failed
    errors=errors,
)


GOLD STAGE: Dimensional Models

Building dimension tables (6 tables)...
  Created healthcare_dev.gold.dim_date with 11,323 rows in 2.06s
  Created healthcare_dev.gold.dim_patient with 1,809 rows in 2.10s
  Created healthcare_dev.gold.dim_provider with 5,061 rows in 1.99s
  Created healthcare_dev.gold.dim_organization with 5,061 rows in 2.20s
  Created healthcare_dev.gold.dim_payer with 160 rows in 1.88s
  Created healthcare_dev.gold.dim_encounter_type with 76 rows in 1.99s

Building fact tables (5 tables)...
  Created healthcare_dev.gold.fact_encounter with 168,069 rows in 2.83s
  Created healthcare_dev.gold.fact_condition with 99,734 rows in 2.56s
  Created healthcare_dev.gold.fact_medication with 124,882 rows in 2.76s
  Created healthcare_dev.gold.fact_procedure with 491,847 rows in 3.59s
  Created healthcare_dev.gold.fact_claim with 1,107,272 rows in 5.33s

Building aggregate tables (8 tables)...
  Created healthcare_dev.gold.agg_patient_summary with 1,809 rows in 3.04s
  Created h

In [0]:
                        
# Final report: print each stage's StageResult under its own heading, with a
# divider before every heading and one closing divider at the end.
print("SUMMARY")
for heading, stage_result in (
    ("BRONZE", bronze_results),
    ("SILVER", silver_results),
    ("GOLD", gold_results),
):
    print(f"\n{'=' * 60}")
    print(heading)
    print(stage_result)
print(f"\n{'=' * 60}")

SUMMARY

BRONZE
StageResult(stage_name='bronze', tables_processed=21, tables_successful=21, total_rows=4765686, elapsed_seconds=558.99, success=True, errors=[])

SILVER
StageResult(stage_name='silver', tables_processed=18, tables_successful=18, total_rows=4359002, elapsed_seconds=58.1, success=True, errors=[])

GOLD
StageResult(stage_name='gold', tables_processed=19, tables_successful=19, total_rows=2023853, elapsed_seconds=49.74, success=True, errors=[])

