In [2]:
import os

from pyspark.sql import SparkSession

# Root directory of the local Iceberg warehouse. It defaults to the original
# hard-coded location; set the ICEBERG_WAREHOUSE_DIR environment variable to
# run this notebook against a different directory or machine.
WAREHOUSE_DIR = os.environ.get(
    "ICEBERG_WAREHOUSE_DIR",
    "/Users/neelkalavadiya/Practicum_Project_Local/iceberg_warehouse",
)

# Step 1: Create (or reuse) the Spark session. It registers a Hadoop-type
# Iceberg catalog named "local" whose warehouse is WAREHOUSE_DIR and enables
# the Iceberg SQL extensions.
spark = (
    SparkSession.builder
    .appName("Healthcare Export")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memory", "4g")
    .config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.local.type", "hadoop")
    .config("spark.sql.catalog.local.warehouse", WAREHOUSE_DIR)
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .getOrCreate()
)

# Log only ERROR and above so WARN/INFO messages do not clutter cell output.
spark.sparkContext.setLogLevel("ERROR")

# Step 2: Define the path to the raw hospital JSON file (inside the warehouse).
json_path = f"{WAREHOUSE_DIR}/raw_dataset/TX/SantaRosa_Hospital_MedicalCenter_SA_TX.json"

# Step 3: Read the JSON file in multiline mode, so a JSON record may span
# several lines instead of being restricted to one record per line.
df = spark.read.option("multiline", "true").json(json_path)

# Step 4: Print the inferred schema in tree format.
df.printSchema()

                                                                                

root
 |-- affirmation: struct (nullable = true)
 |    |-- affirmation: string (nullable = true)
 |    |-- confirm_affirmation: boolean (nullable = true)
 |-- hospital_address: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- hospital_location: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- hospital_name: string (nullable = true)
 |-- last_updated_on: string (nullable = true)
 |-- license_information: struct (nullable = true)
 |    |-- license_number: string (nullable = true)
 |    |-- state: string (nullable = true)
 |-- modifier_information: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- code: string (nullable = true)
 |    |    |-- description: string (nullable = true)
 |    |    |-- modifier_payer_information: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- description: string (nullable = true)
 |    |    |    |    |-- payer_nam

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode_outer
# Flatten the hospital-level attributes and emit one row per element of
# standard_charge_information. explode_outer still emits a row (with a null
# `sci`) when that array is null or empty.
hospital_columns = [
    col("pId").alias("provider_id"),
    col("hospital_name"),
    # Only the first entry of each array is kept.
    col("hospital_address")[0].alias("hospital_address"),
    col("hospital_location")[0].alias("hospital_location"),
    col("last_updated_on"),
    col("license_information.license_number").alias("license_number"),
    col("license_information.state").alias("license_state"),
]

df_exp_1 = df.select(
    *hospital_columns,
    explode_outer(col("standard_charge_information")).alias("sci"),
)

In [4]:
# From each `sci` entry: surface the description and drug fields, emit one
# row per element of sci.code_information, and carry the standard_charges
# array along so a later select can explode it.
service_columns = [
    col("sci.description").alias("service_description"),
    col("sci.drug_information.type").alias("drug_type"),
    col("sci.drug_information.unit").alias("drug_unit"),
]

df_exp_2 = df_exp_1.select(
    "*",
    *service_columns,
    explode_outer(col("sci.code_information")).alias("code_info"),
    col("sci.standard_charges").alias("standard_charges"),
)

In [5]:
# Surface the fields of each exploded `code_info` entry and emit one row per
# element of the standard_charges array (as `charge`).
code_columns = [
    col("code_info.code").alias("code"),
    col("code_info.modifiers").alias("modifiers"),
    col("code_info.type").alias("code_type"),
]

df_exp_3 = df_exp_2.select(
    "*",
    *code_columns,
    explode_outer(col("standard_charges")).alias("charge"),
)

In [6]:
from pyspark.sql.functions import col, explode_outer

# Build the final flat table in two selects:
#   1. surface the charge-level fields and emit one row per element of
#      charge.payers_information (as `payer`);
#   2. surface the payer-level fields, dropping the intermediate structs.

# Columns produced by the earlier selects that are carried through unchanged.
carried_columns = [
    "provider_id", "hospital_name", "hospital_address", "hospital_location", "last_updated_on",
    "license_number", "license_state", "service_description", "drug_type", "drug_unit",
    "code", "modifiers", "code_type",
]

# (field inside the `payer` struct, output column name)
payer_fields = [
    ("payer_name", "payer_name"),
    ("plan_name", "plan_name"),
    ("billing_class", "billing_class"),
    ("methodology", "methodology"),
    ("standard_charge_dollar", "standard_charge_dollar"),
    ("standard_charge_percentage", "standard_charge_percentage"),
    ("additional_payer_notes", "additional_payer_notes"),
    ("pIdx", "payer_id"),
]

charge_level = df_exp_3.select(
    *carried_columns,
    col("charge.setting").alias("care_setting"),
    col("charge.gross_charge"),
    col("charge.discounted_cash"),
    col("charge.minimum").alias("min_charge"),
    col("charge.maximum").alias("max_charge"),
    explode_outer(col("charge.payers_information")).alias("payer"),
)

df_flat = charge_level.select(
    *carried_columns,
    "care_setting", "gross_charge", "discounted_cash", "min_charge", "max_charge",
    *[col(f"payer.{field}").alias(output_name) for field, output_name in payer_fields],
)

In [7]:
# Location of the Parquet checkpoint for the flattened data.
parquet_path = "/Users/neelkalavadiya/Practicum_Project_Local/iceberg_warehouse/checkpoint_parquet/ingestion/santa_rosa_medical_center.parquet"

# Number of partitions the data is shuffled into before it is written.
PARQUET_PARTITIONS = 200

# Repartition only as part of the write instead of rebinding df_flat, so this
# cell does not modify its own input and can be re-run safely. "overwrite"
# replaces any checkpoint already present at parquet_path.
df_flat.repartition(PARQUET_PARTITIONS).write.mode("overwrite").parquet(parquet_path)

                                                                                

In [8]:
# Load the Parquet checkpoint at parquet_path into a new DataFrame.
checkpoint_reader = spark.read
df_read_back = checkpoint_reader.parquet(parquet_path)

In [9]:
# Drop the bronze table if it already exists; a missing table is not an error.
table_to_drop = "local.bronze.santa_rosa_medical_center"
spark.sql(f"DROP TABLE IF EXISTS {table_to_drop}")

DataFrame[]

In [10]:
# Write the reloaded data to the bronze table as an Iceberg table with
# format-version 2, creating it or replacing it if it already exists.
bronze_writer = (
    df_read_back.writeTo("local.bronze.santa_rosa_medical_center")
    .using("iceberg")
    .tableProperty("format-version", "2")
)
bronze_writer.createOrReplace()

                                                                                

In [11]:
# List the tables in the bronze namespace, without truncating long values.
bronze_tables = spark.sql("SHOW TABLES IN local.bronze")
bronze_tables.show(truncate=False)

# Show the column names and data types of the table.
table_columns = spark.sql("DESCRIBE TABLE local.bronze.santa_rosa_medical_center")
table_columns.show()


+---------+----------------------------+-----------+
|namespace|tableName                   |isTemporary|
+---------+----------------------------+-----------+
|bronze   |santa_rosa_medical_center   |false      |
|bronze   |baptist_medical_center_sa_tx|false      |
+---------+----------------------------+-----------+

+-------------------+---------+-------+
|           col_name|data_type|comment|
+-------------------+---------+-------+
|        provider_id|   string|   null|
|      hospital_name|   string|   null|
|   hospital_address|   string|   null|
|  hospital_location|   string|   null|
|    last_updated_on|   string|   null|
|     license_number|   string|   null|
|      license_state|   string|   null|
|service_description|   string|   null|
|          drug_type|   string|   null|
|          drug_unit|   string|   null|
|               code|   string|   null|
|          modifiers|   string|   null|
|          code_type|   string|   null|
|       care_setting|   string|   null|
|

In [12]:
# DataFrame.show() prints only the first 20 rows by default, so the row cap is
# passed to both the SQL LIMIT and show(); otherwise most of the rows returned
# by the query would never be displayed.
MAX_DESCRIPTIONS = 100

spark.sql(
    f"Select DISTINCT(service_description) From local.bronze.santa_rosa_medical_center LIMIT {MAX_DESCRIPTIONS}"
).show(MAX_DESCRIPTIONS, truncate=False)



[Stage 8:>                                                          (0 + 1) / 1]

+--------------------------------------------------------------------------------------------------------------------------------------------+
|service_description                                                                                                                         |
+--------------------------------------------------------------------------------------------------------------------------------------------+
|LIDOCAINE PF (XYLOCAINE MPF) 1 % INJECTION                                                                                                  |
|OTHER PROCEDURE ON LUNG AND LUNG LINING                                                                                                     |
|TEST SOCKET HEMIPELVECTOMY; PROSTHETICS, ORTHOTICS, AND SUPPLIES                                                                            |
|DETECTION OF 8 AUTOANTIBODIES IN PROSTATE TISSUE                                                                                            |

                                                                                