In [0]:
# Make the folder that contains this notebook importable, so that project
# modules stored alongside it (presumably the `workspaceSetup` package imported
# in the next cell) can be resolved.
import sys

# Path of the running notebook as reported by the Databricks notebook context.
notebook = dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()

# Everything before the final "/" is the notebook's parent folder.
working_dir = notebook.rpartition("/")[0]

# Guard the append: re-running this cell must not stack duplicate entries on
# sys.path.
if working_dir not in sys.path:
    sys.path.append(working_dir)

# NOTE(review): confirm that the value returned by notebookPath() is a
# filesystem path visible to the driver (workspace files are commonly exposed
# under a "/Workspace" prefix) — TODO verify on the target runtime.


In [0]:
from pyspark.sql import SparkSession
import time
from workspaceSetup import create_config, run_pipeline, StageResult, run_cms_builds_in_order
from workspaceSetup.config import discover_state_directories, print_config_summary, setup_catalog_and_schemas
from workspaceSetup.bronze_ingestion import run_bronze_ingestion, BRONZE_SCHEMAS
from workspaceSetup.silver_transformations import run_all_silver_transforms
from workspaceSetup.gold_dimensions import run_gold_builds_in_order
from workspaceSetup import (
    run_reference_ingestion,
    create_reference_config,
    verify_reference_tables
)


In [0]:

# Pipeline settings gathered in one mapping: the target catalog, the
# bronze/silver/gold schema names, the source volume path and the source
# system label. They are handed to create_config as keyword arguments.
pipeline_settings = {
    "catalog": "healthcare_dev",
    "bronze_schema": "bronze",
    "silver_schema": "silver",
    "gold_schema": "gold",
    "volume_source": "/Volumes/hls_bronze/hls_bronze/raw/syntheticData",
    "source_system": "SYNTHEA",
}

config = create_config(**pipeline_settings)

In [0]:
# Look up the state directories under the configured source volume.
states = discover_state_directories(dbutils, config.volume_source)

# Report the configuration together with the number of entries discovered.
state_count = len(states)
print_config_summary(config, state_count)

# Presumably creates the catalog and schemas named in `config` (per the helper's
# name); its return value is kept for inspection.
setup_results = setup_catalog_and_schemas(spark, config)

In [0]:
# Every reference data set sits in its own folder under one volume directory,
# and each keyword argument is named "<data set>_volume". Derive the six paths
# from a single root so relocating the reference data is a one-line change
# instead of six copy-pasted prefixes.
reference_volume_root = "/Volumes/hls_bronze/hls_bronze/raw/reference"
reference_sets = (
    "hcpcs",
    "icd10cm",
    "ndc",
    "snomed_icd10cm",
    "rxnorm",
    "rxnorm_prescribe",
)
# e.g. {"hcpcs_volume": "/Volumes/hls_bronze/hls_bronze/raw/reference/hcpcs", ...}
reference_volumes = {
    f"{name}_volume": f"{reference_volume_root}/{name}" for name in reference_sets
}

ref_config = create_reference_config(
    catalog="healthcare_dev",
    schema="bronze",
    **reference_volumes,
)

In [0]:
# Run the reference ingestion and keep its outcome under a stage-specific name:
# the bronze cell that follows rebinds the generic `results`, which would
# otherwise discard this value before it can be inspected.
reference_results = run_reference_ingestion(spark, ref_config)

# Keep the original generic name bound as well, for backward compatibility.
results = reference_results

In [0]:

# --- Bronze stage: run the ingestion and condense its per-table results ---
banner = "=" * 60
print("\n" + banner)
print("BRONZE STAGE: Raw CSV Ingestion")
print(banner)

# Wall-clock start; the elapsed time is taken after the results are tallied.
start_time = time.time()

results = run_bronze_ingestion(spark, config, include_text=True)

# Split the per-table results by outcome in a single pass.
successful, failed = [], []
for table_result in results:
    (successful if table_result.success else failed).append(table_result)

total_rows = sum(ok.row_count for ok in successful)
elapsed = time.time() - start_time

# One "<table>: <error>" line per failed table.
errors = [f"{bad.table_name}: {bad.error}" for bad in failed]

# The stage counts as successful only when no table failed.
bronze_results = StageResult(
    stage_name="bronze",
    tables_processed=len(results),
    tables_successful=len(successful),
    total_rows=total_rows,
    elapsed_seconds=round(elapsed, 2),
    success=not failed,
    errors=errors,
)

In [0]:
    # --- Silver stage: run the transforms and condense their per-table results ---
    # NOTE(review): every line of this cell is indented one level although there
    # is no enclosing block; confirm this is intentional — a plain-Python export
    # of the notebook would reject it.
    banner = "=" * 60
    print("\n" + banner)
    print("SILVER STAGE: FHIR Transformations")
    print(banner)

    # Wall-clock start; the elapsed time is taken after the results are tallied.
    start_time = time.time()

    results = run_all_silver_transforms(spark, config)

    # Split the per-table results by outcome in a single pass.
    successful, failed = [], []
    for table_result in results:
        (successful if table_result.success else failed).append(table_result)

    total_rows = sum(ok.row_count for ok in successful)
    elapsed = time.time() - start_time

    # One "<table>: <error>" line per failed table.
    errors = [f"{bad.table_name}: {bad.error}" for bad in failed]

    # The stage counts as successful only when no table failed.
    silver_results = StageResult(
        stage_name="silver",
        tables_processed=len(results),
        tables_successful=len(successful),
        total_rows=total_rows,
        elapsed_seconds=round(elapsed, 2),
        success=not failed,
        errors=errors,
    )

In [0]:

# --- Gold stage: run the builds and condense their per-table results ---
banner = "=" * 60
print("\n" + banner)
print("GOLD STAGE: Dimensional Models")
print(banner)

# Wall-clock start; the elapsed time is taken after the results are tallied.
start_time = time.time()

results = run_gold_builds_in_order(spark, config)

# Split the per-table results by outcome in a single pass.
successful, failed = [], []
for table_result in results:
    (successful if table_result.success else failed).append(table_result)

total_rows = sum(ok.row_count for ok in successful)
elapsed = time.time() - start_time

# One "<table>: <error>" line per failed table.
errors = [f"{bad.table_name}: {bad.error}" for bad in failed]

# The stage counts as successful only when no table failed.
gold_results = StageResult(
    stage_name="gold",
    tables_processed=len(results),
    tables_successful=len(successful),
    total_rows=total_rows,
    elapsed_seconds=round(elapsed, 2),
    success=not failed,
    errors=errors,
)

In [0]:
# --- Gold CMS stage: run the CMS builds and condense their per-table results ---
print("\n" + "=" * 60)
print("GOLD CMS STAGE: Dimensional Models")
print("=" * 60)

# Wall-clock start; the elapsed time is taken after the results are tallied.
start_time = time.time()

results = run_cms_builds_in_order(spark, config)

successful = [r for r in results if r.success]
failed = [r for r in results if not r.success]
total_rows = sum(r.row_count for r in successful)
elapsed = time.time() - start_time

# One "<table>: <error>" line per failed table.
errors = [f"{r.table_name}: {r.error}" for r in failed]

# Store the outcome under its own name and stage label. Previously this cell
# rebound `gold_results` with stage_name="gold", which discarded the result of
# the gold stage and made the summary report CMS numbers under the GOLD heading.
gold_cms_results = StageResult(
    stage_name="gold_cms",
    tables_processed=len(results),
    tables_successful=len(successful),
    total_rows=total_rows,
    elapsed_seconds=round(elapsed, 2),
    success=len(failed) == 0,
    errors=errors,
)

# The summary cell reports bronze, silver and gold only, so show the CMS
# outcome here to keep it visible.
print(gold_cms_results)

In [0]:
                        
# Final report: a heading, then each stage's label and StageResult, with a
# separator line after the heading and after every stage.
separator = "\n" + "=" * 60

print("SUMMARY")
print(separator)

stage_reports = (
    ("BRONZE", bronze_results),
    ("SILVER", silver_results),
    ("GOLD", gold_results),
)
for stage_label, stage_result in stage_reports:
    print(stage_label)
    print(stage_result)
    print(separator)