In [1]:
from pyspark.sql import SparkSession

import os

# Location of the Iceberg (hadoop-type) warehouse. It can be overridden with
# the ICEBERG_WAREHOUSE environment variable so the notebook is not tied to
# one machine; when the variable is unset or empty the original local path
# is used.
ICEBERG_WAREHOUSE = (
    os.environ.get("ICEBERG_WAREHOUSE")
    or "/Users/neelkalavadiya/Practicum_Project_Local/iceberg_warehouse"
)

# Build (or reuse) the Spark session with an Iceberg catalog named "local".
spark = (
    SparkSession.builder
    .appName("Healthcare Export")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memory", "4g")
    .config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.local.type", "hadoop")
    .config("spark.sql.catalog.local.warehouse", ICEBERG_WAREHOUSE)
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .getOrCreate()
)

# Only log ERROR and above, which hides WARN (and lower) messages
spark.sparkContext.setLogLevel("ERROR")

25/05/04 15:03:11 WARN Utils: Your hostname, NEELs-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 10.190.171.120 instead (on interface en0)
25/05/04 15:03:11 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/04 15:03:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from datetime import datetime

# Load the Silver-layer table that every gold dimension/fact below is derived from
silver_table_name = "local.silver.baptist_medical_center"
silver_df = spark.read.table(silver_table_name)

In [3]:
# Capture one timestamp for this run and attach it to every row as a literal,
# so all rows written by this run share the same gold_ingestion_ts value.
current_ts = datetime.now()
gold_ts_literal = F.lit(current_ts)
silver_df = silver_df.withColumn("gold_ingestion_ts", gold_ts_literal)

In [4]:
# 1. Dim_Hospital: distinct combinations of the hospital attributes
hospital_dim_columns = [
    "provider_id",
    "hospital_name",
    "city",
    "state",
    "license_number",
    "license_state",
]
hospital_dim = silver_df.select(*hospital_dim_columns).distinct()

In [5]:
# 2. Dim_Payer: distinct combinations of the payer/plan attributes
payer_dim_columns = [
    "payer_name",
    "plan_name",
    "billing_class",
    "methodology",
    "payer_category",
    "pricing_model",
    "plan_type",
]
payer_dim = silver_df.select(*payer_dim_columns).distinct()

In [6]:
# 3. Dim_Procedure: distinct combinations of the procedure/code attributes
procedure_dim_columns = [
    "code",
    "modifiers",
    "code_type",
    "care_setting",
    "treatment_type",
    "is_medication",
    "drug_form",
    "imaging_type",
    "is_lab_test",
    "has_brand_indicator",
]
procedure_dim = silver_df.select(*procedure_dim_columns).distinct()

In [7]:
# Fact Table: Fact_Charges
# Projection of the silver rows (no de-duplication) down to the dimension
# keys, the charge measures, and the two ingestion timestamps.
fact_key_columns = ["provider_id", "payer_name", "code"]
fact_measure_columns = [
    "gross_charge",
    "discounted_cash",
    "min_charge",
    "max_charge",
    "standard_charge_dollar",
    "standard_charge_percentage",
    "charge_bucket",
    "payer_info_missing",
]
fact_audit_columns = ["silver_ingestion_ts", "gold_ingestion_ts"]

fact_charges = silver_df.select(
    *fact_key_columns,
    *fact_measure_columns,
    *fact_audit_columns,
)

In [76]:
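# # (Disabled) Save dimensions and fact tables as Parquet for inspection.
# # NOTE: all four writes below target the same directory with mode("overwrite"),
# # so each write would replace the previous one; give every table its own
# # subdirectory before re-enabling this cell.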
# hospital_dim.write.mode("overwrite").parquet("/Users/neelkalavadiya/iceberg_warehouse/checkpoint_parquet/transform/aggregation")
# payer_dim.write.mode("overwrite").parquet("/Users/neelkalavadiya/iceberg_warehouse/checkpoint_parquet/transform/aggregation")
# procedure_dim.write.mode("overwrite").parquet("/Users/neelkalavadiya/iceberg_warehouse/checkpoint_parquet/transform/aggregation")
# fact_charges.write.mode("overwrite").parquet("/Users/neelkalavadiya/iceberg_warehouse/checkpoint_parquet/transform/aggregation")

                                                                                

In [8]:
from pyspark.sql.utils import AnalysisException

# (DataFrame, fully-qualified Iceberg table name) pairs to persist in the gold layer
gold_tables = [
    (hospital_dim, "local.gold.dim_hospital"),
    (payer_dim, "local.gold.dim_payer"),
    (procedure_dim, "local.gold.dim_procedure"),
    (fact_charges, "local.gold.fact_charges")
]


def _gold_table_exists(table_name):
    """Return True if `table_name` can be resolved by the catalog, else False.

    The table is probed by loading it: Spark raises AnalysisException when
    the name cannot be resolved, which is treated here as "does not exist".
    """
    try:
        spark.read.table(table_name)
    except AnalysisException:
        return False
    return True


# Append to tables that already exist and create the ones that do not.
# Existence is checked up front instead of catching AnalysisException around
# append(): that exception is also raised for other analysis failures (for
# example a schema mismatch with the existing table), and falling back to
# create() in that case would only fail again on an already-existing table.
# NOTE(review): appending the de-duplicated dimensions on every run adds the
# same rows again to an existing table - confirm whether append is intended
# for the dim_* tables.
for df, table in gold_tables:
    writer = (
        df.writeTo(table)
        .using("iceberg")
        .tableProperty("format-version", "2")
    )
    if _gold_table_exists(table):
        writer.append()
        print(f"✅ Appended to existing table: {table}")
    else:
        writer.create()
        print(f"🆕 Created and loaded table: {table}")


                                                                                

🆕 Created and loaded table: local.gold.dim_hospital


                                                                                

🆕 Created and loaded table: local.gold.dim_payer


                                                                                

🆕 Created and loaded table: local.gold.dim_procedure


                                                                                

🆕 Created and loaded table: local.gold.fact_charges


In [2]:
# Sanity check: read the gold hospital dimension back and print up to 10 full-width rows
dim_hospital_table = "local.gold.dim_hospital"
df_dim_check = spark.read.table(dim_hospital_table)
df_dim_check.show(n=10, truncate=False)

+-----------+-------------+----+-----+--------------+-------------+
|provider_id|hospital_name|city|state|license_number|license_state|
+-----------+-------------+----+-----+--------------+-------------+
+-----------+-------------+----+-----+--------------+-------------+



In [88]:
import os
from getpass import getpass

# Connection settings for the local PostgreSQL database that receives the export
jdbc_url = "jdbc:postgresql://localhost:5432/healthcare_insurance"
jdbc_properties = {
    "user": "postgres",
    # Keep the password out of the notebook: read it from the
    # POSTGRES_PASSWORD environment variable, or prompt for it when unset.
    "password": os.environ.get("POSTGRES_PASSWORD") or getpass("PostgreSQL password: "),
    "driver": "org.postgresql.Driver"
}

# df_dim_check was read from local.gold.dim_hospital, so it is exported to the
# matching "dim_hospital" table. Writing it to "fact_charges" in overwrite
# mode would replace the fact table's contents with hospital-dimension rows.
(
    df_dim_check.write
    .format("jdbc")
    .option("url", jdbc_url)
    .option("dbtable", "dim_hospital")
    .options(**jdbc_properties)
    .mode("overwrite")
    .save()
)


                                                                                

In [2]:
# List every table registered under the gold namespace of the "local" catalog
gold_table_listing = spark.sql("SHOW TABLES IN local.gold")
gold_table_listing.show(truncate=False)

                                                                                

+---------+-----------------+-----------+
|namespace|tableName        |isTemporary|
+---------+-----------------+-----------+
|gold     |dim_plan         |false      |
|gold     |dim_hospital     |false      |
|gold     |dim_payer        |false      |
|gold     |dim_plan_metadata|false      |
|gold     |dim_procedure    |false      |
|gold     |fact_charges     |false      |
+---------+-----------------+-----------+



In [3]:
# Preview up to 5 rows of the gold hospital dimension without truncating values
dim_hospital_preview = spark.sql("Select * from local.gold.dim_hospital LIMIT 5")
dim_hospital_preview.show(truncate=False)

                                                                                

+-----------+----------------------+-----------+-----+--------------+-------------+
|hospital_id|hospital_name         |city       |state|license_number|license_state|
+-----------+----------------------+-----------+-----+--------------+-------------+
|1514675685 |Baptist Medical Center|San Antonio|TX   |000114        |TX           |
+-----------+----------------------+-----------+-----+--------------+-------------+



In [5]:
# Preview up to 20 rows of the gold payer dimension without truncating values
dim_payer_preview = spark.sql("Select * from local.gold.dim_payer LIMIT 20")
dim_payer_preview.show(truncate=False)

+--------+----------------------------------------+
|payer_id|payer_name                              |
+--------+----------------------------------------+
|1001    |AETNA                                   |
|1002    |ANTHEM AFFILIATES                       |
|1003    |BLUE CROSS BLUE SHIELD OF TEXAS         |
|1004    |CENTENE                                 |
|1005    |CIGNA                                   |
|1006    |COMMUNITY FIRST HEALTH PLAN             |
|1007    |DEVOTED HEALTH PLAN                     |
|1008    |HUMANA                                  |
|1009    |IMAGINE HEALTH                          |
|1010    |MOLINA HEALTHCARE OF TEXAS (CLAIMS ONLY)|
|1011    |OPTUMCARE                               |
|1012    |OSCAR HEALTH                            |
|1013    |PROVIDER PARTNERS HEALTH PLAN           |
|1014    |TEXAS INDEPENDENCE HEALTH PLAN          |
|1015    |UNITED HEALTHCARE                       |
|1016    |WELLCARE                                |
+--------+--