In [1]:
import os
import re

from pyspark.sql import SparkSession

# Build (or reuse) the Spark session. The "local" catalog is an Iceberg
# SparkCatalog of type "hadoop" whose warehouse lives on the local filesystem.
spark = (
    SparkSession.builder
    .appName("Healthcare Export")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memory", "4g")
    .config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.local.type", "hadoop")
    .config("spark.sql.catalog.local.warehouse", "/Users/neelkalavadiya/iceberg_warehouse")
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .getOrCreate()
)

# Raise the log threshold to ERROR so WARN messages are no longer printed.
spark.sparkContext.setLogLevel("ERROR")

# Load a single hospital file and show the schema Spark inferred for it.
# "multiline" is enabled so a JSON document spread over several lines is parsed as one record.
json_path = "/Users/neelkalavadiya/iceberg_warehouse/raw_dataset/TX/Baptist_Medical_Center.json"
json_reader = spark.read.option("multiline", "true")
df = json_reader.json(json_path)
df.printSchema()

25/04/21 11:25:57 WARN Utils: Your hostname, NEELs-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 10.196.170.202 instead (on interface en0)
25/04/21 11:25:57 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/21 11:25:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/04/21 11:25:58 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/04/21 11:25:58 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
                                                                                

root
 |-- affirmation: struct (nullable = true)
 |    |-- affirmation: string (nullable = true)
 |    |-- confirm_affirmation: boolean (nullable = true)
 |-- hospital_address: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- hospital_location: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- hospital_name: string (nullable = true)
 |-- last_updated_on: string (nullable = true)
 |-- license_information: struct (nullable = true)
 |    |-- license: string (nullable = true)
 |    |-- state: string (nullable = true)
 |-- pId: string (nullable = true)
 |-- standard_charge_information: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- code_information: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- code: string (nullable = true)
 |    |    |    |    |-- modifiers: string (nullable = true)
 |    |    |    |    |-- type: string (nullable = true

In [2]:
from pyspark.sql.functions import col, explode, explode_outer

def flatten_healthcare_json(df):
    """
    Flatten a hospital standard-charges JSON DataFrame into one row per
    (charge item, code, standard charge, payer) with the 23 columns below.

    +--------------------------+---------+-------+
    |col_name                  |data_type|comment|
    +--------------------------+---------+-------+
    |provider_id               |string   |null   |
    |hospital_name             |string   |null   |
    |hospital_address          |string   |null   |
    |hospital_location         |string   |null   |
    |last_updated_on           |string   |null   |
    |license_number            |string   |null   |
    |license_state             |string   |null   |
    |service_description       |string   |null   |
    |code                      |string   |null   |
    |modifiers                 |string   |null   |
    |code_type                 |string   |null   |
    |care_setting              |string   |null   |
    |gross_charge              |double   |null   |
    |discounted_cash           |double   |null   |
    |min_charge                |bigint   |null   |
    |max_charge                |bigint   |null   |
    |payer_name                |string   |null   |
    |plan_name                 |string   |null   |
    |billing_class             |string   |null   |
    |methodology               |string   |null   |
    |standard_charge_dollar    |bigint   |null   |
    |standard_charge_percentage|bigint   |null   |
    |additional_payer_notes    |string   |null   |
    +--------------------------+---------+-------+

    Columns that exist in the input keep whatever type Spark inferred for them;
    the listed types are only enforced for payer columns that have to be
    synthesised as nulls (see below).

    Variations handled:
      * The license value may be stored as ``license_information.license`` or
        ``license_information.license_number``.
      * Standard charges without any payer entry are kept, with null payer
        columns (``explode_outer``).
      * Payer fields that are absent from the inferred schema, or a missing
        ``payers_information`` field altogether, are emitted as typed null
        columns so the output always has the same 23 column names.

    Args:
        df: Raw Spark DataFrame loaded from JSON.
    Returns:
        Flattened Spark DataFrame.
    """
    # Local imports keep this function self-contained with respect to the
    # notebook's import cell, which only brings in col/explode/explode_outer.
    from pyspark.sql.functions import lit
    from pyspark.sql.types import StructType

    # Output payer columns with the type used when a column must be filled with nulls.
    payer_columns = [
        ("payer_name", "string"),
        ("plan_name", "string"),
        ("billing_class", "string"),
        ("methodology", "string"),
        ("standard_charge_dollar", "bigint"),
        ("standard_charge_percentage", "bigint"),
        ("additional_payer_notes", "string"),
    ]

    # Check license field name
    license_fields = df.select("license_information.*").schema.names
    license_col = "license" if "license" in license_fields else "license_number"

    # Explode standard_charge_information: one row per charge item.
    df_sci = df.select(
        col("pId").alias("provider_id"),
        col("hospital_name"),
        # Only the first entry of the address/location arrays is kept.
        col("hospital_address")[0].alias("hospital_address"),
        col("hospital_location")[0].alias("hospital_location"),
        col("last_updated_on"),
        col(f"license_information.{license_col}").alias("license_number"),
        col("license_information.state").alias("license_state"),
        explode("standard_charge_information").alias("sci")
    )

    # Explode code_information: one row per billing code of each item.
    df_code = df_sci.select(
        "*",
        col("sci.description").alias("service_description"),
        explode("sci.code_information").alias("code_info"),
        col("sci.standard_charges").alias("standard_charges")
    )

    # Explode standard_charges: one row per charge of each code.
    df_charges = df_code.select(
        "*",
        col("code_info.code").alias("code"),
        col("code_info.modifiers").alias("modifiers"),
        col("code_info.type").alias("code_type"),
        explode("standard_charges").alias("charge")
    )

    # Work out which payer fields the inferred schema actually contains, so
    # that missing ones can be replaced by nulls instead of failing analysis.
    charge_type = df_charges.schema["charge"].dataType
    has_payers = "payers_information" in charge_type.fieldNames()
    payer_fields = set()
    if has_payers:
        payers_type = charge_type["payers_information"].dataType
        element_type = getattr(payers_type, "elementType", None)
        # Arrays that are empty in every record are not inferred as structs,
        # in which case there are no payer fields to read.
        if isinstance(element_type, StructType):
            payer_fields = set(element_type.fieldNames())

    # Explode payers_information (outer, so charges without payers survive).
    payer_stage = [
        "*",
        col("charge.setting").alias("care_setting"),
        col("charge.gross_charge"),
        col("charge.discounted_cash"),
        col("charge.minimum").alias("min_charge"),
        col("charge.maximum").alias("max_charge"),
    ]
    if has_payers:
        payer_stage.append(explode_outer("charge.payers_information").alias("payer"))
    df_payer = df_charges.select(*payer_stage)

    # Final flattening: real payer fields where available, typed nulls otherwise.
    payer_selection = [
        col(f"payer.{name}") if name in payer_fields
        else lit(None).cast(dtype).alias(name)
        for name, dtype in payer_columns
    ]
    df_final_flat = df_payer.select(
        "provider_id", "hospital_name", "hospital_address", "hospital_location",
        "last_updated_on", "license_number", "license_state", "service_description",
        "code", "modifiers", "code_type", "care_setting", "gross_charge",
        "discounted_cash", "min_charge", "max_charge",
        *payer_selection
    )

    return df_final_flat


In [3]:
# Directory that holds the raw hospital JSON files to ingest.
raw_dir = "/home/jovyan/iceberg_warehouse/raw_data/automate_test/"
# os.listdir() returns entries in arbitrary order; sort so every run
# processes the files in the same sequence.
json_files = sorted(f for f in os.listdir(raw_dir) if f.endswith(".json"))

def create_spark_session():
    """Return the "Healthcare Export" SparkSession, creating it if none exists.

    The session requests 8g of driver and 4g of executor memory, registers an
    Iceberg ``SparkCatalog`` named ``local`` (type ``hadoop``) with its
    warehouse at ``/home/jovyan/iceberg_warehouse``, and enables the Iceberg
    Spark session extensions.
    """
    # Applied in insertion order, matching the original chained .config() calls.
    settings = {
        "spark.driver.memory": "8g",
        "spark.executor.memory": "4g",
        "spark.sql.catalog.local": "org.apache.iceberg.spark.SparkCatalog",
        "spark.sql.catalog.local.type": "hadoop",
        "spark.sql.catalog.local.warehouse": "/home/jovyan/iceberg_warehouse",
        "spark.sql.extensions": "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
    }

    builder = SparkSession.builder.appName("Healthcare Export")
    for key, value in settings.items():
        builder = builder.config(key, value)
    return builder.getOrCreate()

def process_json_file(file, spark):
    """
    Flatten one raw hospital JSON file and publish it as an Iceberg bronze table.

    The file is read from ``raw_dir``, flattened with
    ``flatten_healthcare_json``, written to a Parquet checkpoint, read back,
    and written to ``local.bronze.<name>``. ``<name>`` is the file name without
    its extension, lower-cased, with every character other than ``a-z``,
    ``0-9`` and ``_`` replaced by ``_`` so it is safe to embed in SQL.

    Args:
        file: Name (not full path) of a ``.json`` file inside ``raw_dir``.
        spark: SparkSession used for all reads and writes.
    Returns:
        True when the table was written; False when any step raised. The error
        is printed and not re-raised, so a batch can continue with other files.
    """
    try:
        print(f"\n🚀 Processing {file}")
        path = os.path.join(raw_dir, file)
        df_raw = spark.read.option("multiline", "true").json(path)
        df_flat = flatten_healthcare_json(df_raw)

        # Build an identifier-safe table name. Spaces and hyphens map to "_"
        # exactly as before; dots, quotes, parentheses etc. are now covered
        # too, since they would otherwise break the SQL statements below.
        stem = os.path.splitext(file)[0]
        hospital_name = re.sub(r"[^0-9a-z_]", "_", stem.lower())
        table_name = f"local.bronze.{hospital_name}"
        parquet_path = f"/home/jovyan/iceberg_warehouse/checkpoint_parquet/ingestion/{hospital_name}.parquet"

        print(f"✅ Writing to Parquet format: {parquet_path}")
        df_flat.write.mode("overwrite").parquet(parquet_path)

        # The table is dropped explicitly before it is recreated from the
        # Parquet checkpoint, preserving the original start-from-scratch behaviour.
        spark.sql(f"DROP TABLE IF EXISTS {table_name}")
        df_back = spark.read.parquet(parquet_path)

        print(f"✅ Writing to iceberg table: {table_name}")
        df_back.writeTo(table_name) \
            .using("iceberg") \
            .tableProperty("format-version", "2") \
            .createOrReplace()

        print(f"✅ Done: {table_name}")
        return True
    except Exception as e:
        # Deliberate best-effort: report the failure and let the caller decide.
        print(f"❌ Error processing {file}: {e}")
        return False

In [5]:
# List every table registered in the bronze namespace.
bronze_tables = spark.sql("SHOW TABLES IN local.bronze")
bronze_tables.show(truncate=False)

# Show up to 30 rows of column metadata for the Baptist Medical Center table.
baptist_schema = spark.sql("DESCRIBE TABLE local.bronze.baptist_medical_center")
baptist_schema.show(30, truncate=False)

+---------+-----------------------------------------+-----------+
|namespace|tableName                                |isTemporary|
+---------+-----------------------------------------+-----------+
|bronze   |baptist_medical_center                   |false      |
|bronze   |santarosahospitalmedicalcenter_sanantonio|false      |
+---------+-----------------------------------------+-----------+

+--------------------------+---------+-------+
|col_name                  |data_type|comment|
+--------------------------+---------+-------+
|provider_id               |string   |null   |
|hospital_name             |string   |null   |
|hospital_address          |string   |null   |
|hospital_location         |string   |null   |
|last_updated_on           |string   |null   |
|license_number            |string   |null   |
|license_state             |string   |null   |
|service_description       |string   |null   |
|code                      |string   |null   |
|modifiers                 |string   |n

In [5]:
# List the tables currently registered in the bronze namespace.
bronze_tables = spark.sql("SHOW TABLES IN local.bronze")
bronze_tables.show(truncate=False)

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
+---------+---------+-----------+



In [24]:
# Preview a single untruncated row from each bronze table.
baptist_rows = spark.sql("Select * from local.bronze.baptist_medical_center")
baptist_rows.show(1, truncate=False)

santa_rosa_rows = spark.sql("Select * from local.bronze.santarosahospitalmedicalcenter_sanantonio")
santa_rosa_rows.show(1, truncate=False)

+-----------+----------------------+------------------------------------+----------------------+---------------+--------------+-------------+-------------------+-----+---------+---------+------------+------------+---------------+----------+----------+----------+-----------------------+-------------+------------+----------------------+--------------------------+----------------------+
|provider_id|hospital_name         |hospital_address                    |hospital_location     |last_updated_on|license_number|license_state|service_description|code |modifiers|code_type|care_setting|gross_charge|discounted_cash|min_charge|max_charge|payer_name|plan_name              |billing_class|methodology |standard_charge_dollar|standard_charge_percentage|additional_payer_notes|
+-----------+----------------------+------------------------------------+----------------------+---------------+--------------+-------------+-------------------+-----+---------+---------+------------+------------+-------------

In [4]:
# Remove the Mission Trail Baptist bronze table if it exists.
# Plain string literal: the statement has no placeholders, so no f-string is needed.
spark.sql("DROP TABLE IF EXISTS local.bronze.missiontrailbaptisthospital_sanantonio")

DataFrame[]