In [1]:
from pyspark.sql import SparkSession

# Jars put on the Spark classpath: the Iceberg runtime (used by the `local`
# catalog configured below) and the PostgreSQL JDBC driver (used by the export).
spark_jars = ",".join([
    "/Users/neelkalavadiya/spark-jars/iceberg-spark-runtime-3.4_2.12-1.3.1.jar",
    "/Users/neelkalavadiya/spark-jars/postgresql-42.7.2.jar",
])

# Build (or reuse) the session. `local` is registered as a Hadoop-type Iceberg
# catalog whose warehouse lives on the local filesystem.
spark = (
    SparkSession.builder
    .appName("Healthcare Export")
    .config("spark.driver.memory", "8g")
    .config("spark.jars", spark_jars)
    .config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.local.type", "hadoop")
    .config("spark.sql.catalog.local.warehouse", "/Users/neelkalavadiya/iceberg_warehouse")
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .getOrCreate()
)


25/04/16 10:29:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/16 10:29:58 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/04/16 10:29:58 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
25/04/16 10:29:58 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


In [27]:
# List every table registered under the `silver` namespace of the `local` catalog.
silver_table_listing = spark.sql("SHOW TABLES IN local.silver")
silver_table_listing.show(truncate=False)

+---------+-------------------------+-----------+
|namespace|tableName                |isTemporary|
+---------+-------------------------+-----------+
|silver   |mission_trail_baptist    |false      |
|silver   |hospital_demo            |false      |
|silver   |santa_rosa_new_braunfels |false      |
|silver   |resolute_health          |false      |
|silver   |santa_rosa_westover_hills|false      |
|silver   |north_central_baptist    |false      |
|silver   |santa_rosa_medical_center|false      |
|silver   |baptist_medical_center   |false      |
+---------+-------------------------+-----------+



In [28]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from datetime import datetime

# Load the silver-layer table that the gold dimensions and fact are derived from.
silver_table_name = "local.silver.hospital_demo"
silver_df = spark.read.table(silver_table_name)

In [29]:
# Capture one driver-side timestamp and attach it to every row as a literal,
# so all rows derived from `silver_df` in this run share the same value.
current_ts = datetime.now()
gold_ts_literal = F.lit(current_ts)
silver_df = silver_df.withColumn("gold_ingestion_ts", gold_ts_literal)

In [30]:
# 1. Dim_Hospital: one row per distinct combination of the hospital attributes.
hospital_columns = [
    "provider_id",
    "hospital_name",
    "city",
    "state",
    "license_number",
    "license_state",
]
hospital_dim = silver_df.select(*hospital_columns).dropDuplicates()

In [31]:
# 2. Dim_Payer: one row per distinct combination of the payer / plan attributes.
payer_columns = [
    "payer_name",
    "plan_name",
    "billing_class",
    "methodology",
    "payer_category",
    "pricing_model",
    "plan_type",
]
payer_dim = silver_df.select(*payer_columns).dropDuplicates()

In [32]:
# 3. Dim_Procedure: one row per distinct combination of the procedure attributes.
procedure_columns = [
    "code",
    "modifiers",
    "code_type",
    "care_setting",
    "treatment_type",
    "is_medication",
    "drug_form",
    "imaging_type",
    "is_lab_test",
    "has_brand_indicator",
]
procedure_dim = silver_df.select(*procedure_columns).dropDuplicates()

In [33]:
# Fact table: Fact_Charges. Rows are projected as-is (no de-duplication).
fact_key_columns = ["provider_id", "payer_name", "code"]
fact_measure_columns = [
    "gross_charge",
    "discounted_cash",
    "min_charge",
    "max_charge",
    "standard_charge_dollar",
    "standard_charge_percentage",
]
fact_flag_columns = ["charge_bucket", "payer_info_missing"]
fact_audit_columns = ["silver_ingestion_ts", "gold_ingestion_ts"]

fact_charges = silver_df.select(
    *fact_key_columns,
    *fact_measure_columns,
    *fact_flag_columns,
    *fact_audit_columns,
)

In [34]:
# Save dimensions and fact tables as Parquet for inspection.
#
# Each DataFrame is written to its own sub-directory. Previously all four were
# written with mode("overwrite") to the same directory, so every write replaced
# the one before it and only `fact_charges` was left on disk.
# NOTE(review): Parquet files written directly under the root by earlier runs
# are no longer overwritten by this cell; remove them manually if present.
checkpoint_root = "/Users/neelkalavadiya/iceberg_warehouse/checkpoint_parquet/transform/aggregation"
checkpoint_frames = {
    "dim_hospital": hospital_dim,
    "dim_payer": payer_dim,
    "dim_procedure": procedure_dim,
    "fact_charges": fact_charges,
}

for checkpoint_name, checkpoint_df in checkpoint_frames.items():
    checkpoint_df.write.mode("overwrite").parquet(f"{checkpoint_root}/{checkpoint_name}")

In [35]:
# spark.sql("DROP TABLE IF EXISTS local.gold.dim_hospital_demo")

In [36]:
from pyspark.sql.utils import AnalysisException

# (DataFrame, fully-qualified table name) pairs that make up the gold layer.
gold_tables = [
    (hospital_dim, "local.gold.dim_hospital_demo"),
    (payer_dim, "local.gold.dim_payer_demo"),
    (procedure_dim, "local.gold.dim_procedure_demo"),
    (fact_charges, "local.gold.fact_charges_demo")
]

for df, table in gold_tables:
    # Decide explicitly whether the table exists instead of catching
    # AnalysisException around append(): that exception type is also raised for
    # other analysis failures (e.g. a schema mismatch against an existing
    # table), which the old fallback turned into a create() attempt that hid
    # the real error.
    if spark.catalog.tableExists(table):
        # Provider and table properties are creation-time settings, so they are
        # not repeated on the append path.
        # NOTE(review): re-running this cell appends the same rows again,
        # including the de-duplicated dimensions.
        df.writeTo(table).append()
        print(f"Appended to existing table: {table}")
    else:
        # First load: create the Iceberg table (format version 2) from the data.
        df.writeTo(table) \
            .using("iceberg") \
            .tableProperty("format-version", "2") \
            .create()
        print(f"Created and loaded table: {table}")


Created and loaded table: local.gold.dim_hospital_demo
Created and loaded table: local.gold.dim_payer_demo
Created and loaded table: local.gold.dim_procedure_demo
Created and loaded table: local.gold.fact_charges_demo


In [37]:
# List every table registered under the `gold` namespace of the `local` catalog.
gold_table_listing = spark.sql("SHOW TABLES IN local.gold")
gold_table_listing.show(truncate=False)

+---------+------------------+-----------+
|namespace|tableName         |isTemporary|
+---------+------------------+-----------+
|gold     |dim_hospital      |false      |
|gold     |dim_payer_demo    |false      |
|gold     |dim_hospital_demo |false      |
|gold     |fact_charges_demo |false      |
|gold     |dim_payer         |false      |
|gold     |dim_procedure_demo|false      |
|gold     |dim_procedure     |false      |
|gold     |fact_charges      |false      |
+---------+------------------+-----------+



In [44]:
# Read the gold fact table back and preview the first 10 rows untruncated.
# (Despite its name, `df_dim_check` holds the fact table, not a dimension.)
checked_table_name = "local.gold.fact_charges_demo"
df_dim_check = spark.read.table(checked_table_name)
df_dim_check.show(n=10, truncate=False)

+-----------+-------------------------------+-------------+------------+---------------+----------+----------+----------------------+--------------------------+-------------+------------------+--------------------------+--------------------------+
|provider_id|payer_name                     |code         |gross_charge|discounted_cash|min_charge|max_charge|standard_charge_dollar|standard_charge_percentage|charge_bucket|payer_info_missing|silver_ingestion_ts       |gold_ingestion_ts         |
+-----------+-------------------------------+-------------+------------+---------------+----------+----------+----------------------+--------------------------+-------------+------------------+--------------------------+--------------------------+
|1571364648 |AMERIGROUP                     |86790        |393.0       |176.85         |0.12      |65.3      |null                  |11.55                     |Unknown      |false             |2025-04-16 10:47:44.767562|2025-04-16 10:48:23.861179|
|1571364

In [45]:
# Export the gold fact table to PostgreSQL over JDBC.
#
# Credentials are no longer stored in the notebook: the user and password are
# read from the PGUSER / PGPASSWORD environment variables, and the password is
# prompted for interactively when PGPASSWORD is not set.
import os
from getpass import getpass

jdbc_url = "jdbc:postgresql://localhost:5432/healthcare_insurance"
jdbc_properties = {
    "user": os.environ.get("PGUSER", "postgres"),
    "password": os.environ.get("PGPASSWORD") or getpass("PostgreSQL password: "),
    "driver": "org.postgresql.Driver"
}

# mode("overwrite") replaces the contents of the target table
# `fact_charges_demo` on every run.
df_dim_check.write \
    .format("jdbc") \
    .option("url", jdbc_url) \
    .option("dbtable", "fact_charges_demo") \
    .options(**jdbc_properties) \
    .mode("overwrite") \
    .save()


                                                                                