In [15]:
from pyspark.sql import SparkSession

# Jars handed to Spark via spark.jars: the Iceberg Spark runtime and the
# PostgreSQL JDBC driver, joined into the comma-separated form Spark expects.
spark_jars = ",".join([
    "/Users/neelkalavadiya/spark-jars/iceberg-spark-runtime-3.4_2.12-1.3.1.jar",
    "/Users/neelkalavadiya/spark-jars/postgresql-42.7.2.jar",
])

# Build (or reuse) the session. The catalog named "local" is an Iceberg
# SparkCatalog of type "hadoop" whose warehouse is a local directory.
spark = (
    SparkSession.builder
    .appName("Healthcare Export")
    .config("spark.driver.memory", "8g")
    .config("spark.jars", spark_jars)
    .config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.local.type", "hadoop")
    .config("spark.sql.catalog.local.warehouse", "/Users/neelkalavadiya/iceberg_warehouse")
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .getOrCreate()
)



# Location of the raw hospital JSON file to ingest.
json_path = "/Users/neelkalavadiya/iceberg_warehouse/raw_dataset/TX/Baptist_Medical_Center.json"

# Configure the reader for multiline JSON, then load the file into a DataFrame.
json_reader = spark.read.option("multiline", "true")
df = json_reader.json(json_path)

# Display the inferred schema as a tree.
df.printSchema()

[Stage 12:>                                                         (0 + 1) / 1]

root
 |-- affirmation: struct (nullable = true)
 |    |-- affirmation: string (nullable = true)
 |    |-- confirm_affirmation: boolean (nullable = true)
 |-- hospital_address: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- hospital_location: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- hospital_name: string (nullable = true)
 |-- last_updated_on: string (nullable = true)
 |-- license_information: struct (nullable = true)
 |    |-- license: string (nullable = true)
 |    |-- state: string (nullable = true)
 |-- pId: string (nullable = true)
 |-- standard_charge_information: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- code_information: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- code: string (nullable = true)
 |    |    |    |    |-- modifiers: string (nullable = true)
 |    |    |    |    |-- type: string (nullable = true

                                                                                

In [16]:
#Explode standard_charge_information
from pyspark.sql.functions import col, explode

# Hospital-level attributes carried onto every exploded row. Only the first
# element of the address and location arrays is kept.
hospital_columns = [
    col("pId").alias("provider_id"),
    col("hospital_name"),
    col("hospital_address").getItem(0).alias("hospital_address"),
    col("hospital_location").getItem(0).alias("hospital_location"),
    col("last_updated_on"),
    col("license_information.license").alias("license_number"),
    col("license_information.state").alias("license_state"),
]

# One output row per element of standard_charge_information, exposed as `sci`.
charge_item = explode("standard_charge_information").alias("sci")

df_exploded = df.select(*hospital_columns, charge_item)

In [17]:
# Explode code_information: one row per code entry of each `sci` item.
# Every existing column is kept; the item's description and its
# standard_charges array are pulled up next to the exploded code entry.
service_description = col("sci.description").alias("service_description")
code_entry = explode("sci.code_information").alias("code_info")
charge_array = col("sci.standard_charges").alias("charge_arr")

df_code = df_exploded.select("*", service_description, code_entry, charge_array)

In [18]:
# Explode standard_charges: one row per element of `charge_arr`, with the
# fields of the `code_info` struct surfaced as top-level columns.
code_columns = [
    col("code_info.code").alias("code"),
    col("code_info.modifiers").alias("modifiers"),
    col("code_info.type").alias("code_type"),
]
charge_entry = explode("charge_arr").alias("charge")

df_charges = df_code.select("*", *code_columns, charge_entry)

In [19]:
#Explode payers_information
# Produces one row per (charge, payer) pair and flattens the payer struct.
#
# explode_outer is used instead of explode: explode emits no row at all when
# the array is null or empty, so any charge entry without payers_information
# would silently vanish from the output (together with its gross_charge,
# discounted_cash, min and max values). With explode_outer such an entry is
# kept as a single row whose `payer` struct and payer-derived columns are null.
# Imported here so this cell does not depend on an edit to an earlier cell.
from pyspark.sql.functions import explode_outer

df_final_flat = df_charges.select(
    # Hospital / service / code columns produced by the earlier cells.
    "provider_id", "hospital_name", "hospital_address", "hospital_location",
    "last_updated_on", "license_number", "license_state", "service_description",
    "code", "modifiers", "code_type",
    # Charge-level fields taken from the exploded `charge` struct.
    col("charge.setting").alias("care_setting"),
    col("charge.gross_charge"),
    col("charge.discounted_cash"),
    col("charge.minimum").alias("min_charge"),
    col("charge.maximum").alias("max_charge"),
    explode_outer("charge.payers_information").alias("payer")
).select(
    # "*" keeps every column above, including the `payer` struct itself,
    # so the output schema is the same as before this change.
    "*",
    col("payer.payer_name"),
    col("payer.plan_name"),
    col("payer.billing_class"),
    col("payer.methodology"),
    col("payer.standard_charge_dollar"),
    col("payer.standard_charge_percentage"),
    col("payer.additional_payer_notes"),
    col("payer.standard_charge_algorithm")
)

In [20]:
# Checkpoint the flattened DataFrame as Parquet, replacing any previous
# output at the same location.
parquet_path = "/Users/neelkalavadiya/iceberg_warehouse/checkpoint_parquet/ingestion/baptist_medical_center_SA.parquet"
checkpoint_writer = df_final_flat.write
checkpoint_writer.parquet(parquet_path, mode="overwrite")


                                                                                

In [8]:
# Reload the Parquet checkpoint written to parquet_path.
df_read_back = spark.read.format("parquet").load(parquet_path)

In [9]:
# Remove the table from the Iceberg catalog "local" if it is already present.
drop_bronze_table_sql = "DROP TABLE IF EXISTS local.bronze.baptist_medical_center"
spark.sql(drop_bronze_table_sql)


DataFrame[]

In [10]:
# Ensure the `bronze` namespace exists in the "local" catalog.
create_bronze_namespace_sql = "CREATE NAMESPACE IF NOT EXISTS local.bronze"
spark.sql(create_bronze_namespace_sql)


DataFrame[]

In [13]:
# Describe the target table: Iceberg provider, format-version table property "2".
bronze_table_writer = (
    df_read_back.writeTo("local.bronze.baptist_medical_center")
    .using("iceberg")
    .tableProperty("format-version", "2")
)

# Create the table from the reloaded data, or replace it if it already exists.
bronze_table_writer.createOrReplace()


                                                                                

In [14]:
# List the tables in the bronze namespace without truncating the names.
bronze_tables = spark.sql("SHOW TABLES IN local.bronze")
bronze_tables.show(truncate=False)

# Print the column names and data types of the table just written.
bronze_table_columns = spark.sql("DESCRIBE TABLE local.bronze.baptist_medical_center")
bronze_table_columns.show()


+---------+-------------------------+-----------+
|namespace|tableName                |isTemporary|
+---------+-------------------------+-----------+
|bronze   |santa_rosa_westover_hills|false      |
|bronze   |baptist_medical_center   |false      |
|bronze   |resolute_health          |false      |
|bronze   |santa_rosa_medical_center|false      |
|bronze   |santa_rosa_new_braunfels |false      |
|bronze   |mission_trail_baptist    |false      |
|bronze   |north_central_baptist    |false      |
+---------+-------------------------+-----------+

+-------------------+--------------------+-------+
|           col_name|           data_type|comment|
+-------------------+--------------------+-------+
|        provider_id|              string|   null|
|      hospital_name|              string|   null|
|   hospital_address|              string|   null|
|  hospital_location|              string|   null|
|    last_updated_on|              string|   null|
|     license_number|              string