In [3]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session for this notebook. It registers an Iceberg
# catalog named "local": a Hadoop-type catalog whose warehouse is a local directory.
# NOTE(review): the warehouse path is absolute and machine-specific; consider
# deriving it from a configurable base directory.
spark = (
    SparkSession.builder
    .appName("Healthcare Export")
    # Memory settings for the driver and the executors.
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memory", "4g")
    # Iceberg catalog definition and its warehouse location.
    .config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.local.type", "hadoop")
    .config("spark.sql.catalog.local.warehouse", "/Users/neelkalavadiya/Practicum_Project_Local/iceberg_warehouse")
    # Iceberg's SQL extensions for Spark.
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .getOrCreate()
)

# Raise the log threshold to ERROR so WARN (and lower) messages are hidden from
# here on. Warnings logged while the session was starting are still printed,
# because they are emitted before this call runs.
spark.sparkContext.setLogLevel("ERROR")

25/05/06 10:23:51 WARN Utils: Your hostname, NEELs-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 10.190.186.140 instead (on interface en0)
25/05/06 10:23:51 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/06 10:23:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Raw CSV for a single hospital, stored under the warehouse's raw_dataset/TX folder.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
csv_path = "/Users/neelkalavadiya/Practicum_Project_Local/iceberg_warehouse/raw_dataset/TX/vernon_memorial.csv"

# The first row supplies the column names and Spark infers each column's type.
# NOTE(review): inference types provider_id, license_number and several
# "code | N" columns as integers (see the printed schema). If any of those
# identifiers can carry leading zeros, they are lost here -- confirm.
df = spark.read.csv(csv_path, header=True, inferSchema=True)

# Inspect the inferred schema and the first rows.
df.printSchema()
df.show()

root
 |-- hospital_name: string (nullable = true)
 |-- provider_id: integer (nullable = true)
 |-- license_number: integer (nullable = true)
 |-- license_state: string (nullable = true)
 |-- hospital_address: string (nullable = true)
 |-- last_updated_on: date (nullable = true)
 |-- description: string (nullable = true)
 |-- code | 1: string (nullable = true)
 |-- code | 1 | type: string (nullable = true)
 |-- code | 2: integer (nullable = true)
 |-- code | 2 | type: string (nullable = true)
 |-- code | 3: integer (nullable = true)
 |-- code | 3 | type: string (nullable = true)
 |-- code | 4: string (nullable = true)
 |-- code | 4 | type: string (nullable = true)
 |-- code | 5: string (nullable = true)
 |-- code | 5 | type: string (nullable = true)
 |-- code | 6: integer (nullable = true)
 |-- code | 6 | type: string (nullable = true)
 |-- setting: string (nullable = true)
 |-- drug_unit_of_measurement: string (nullable = true)
 |-- drug_type_of_measurement: string (nullable = true)
 |

In [7]:
# Preview the raw frame: first 20 rows, long cell values truncated (the defaults, spelled out).
# NOTE(review): this repeats the df.show() from the load cell, and the frame is
# wide enough that the table wraps; consider dropping this cell.
df.show(n=20, truncate=True)

+--------------------+-----------+--------------+-------------+--------------------+---------------+--------------------+--------+---------------+--------+---------------+--------+---------------+--------+---------------+--------+---------------+--------+---------------+---------+------------------------+------------------------+-----------------------+---------------------------------+--------------------+--------------------+---------+-----------------------------------+---------------------------------------+--------------------------------------+----------------+---------------------+---------------------+-----------------------------+------------------------+-------------+
|       hospital_name|provider_id|license_number|license_state|    hospital_address|last_updated_on|         description|code | 1|code | 1 | type|code | 2|code | 2 | type|code | 3|code | 3 | type|code | 4|code | 4 | type|code | 5|code | 5 | type|code | 6|code | 6 | type|  setting|drug_unit_of_measurement|drug_ty

In [6]:
from pyspark.sql.functions import col, lit

# Project the raw CSV columns onto the aligned layout: rename columns, cast
# types, and add a NULL placeholder for the one field the CSV does not provide.
# The order of the entries below is the column order of the resulting frame.
# NOTE(review): the casts to "long" drop any fractional part of the min/max and
# negotiated charge values, while gross_charge and discounted_cash keep their
# inferred double type -- confirm the target schema really expects whole numbers.
aligned_df = df.select([
    # --- hospital identity ---
    col("provider_id").cast("string").alias("hospital_id"),
    col("hospital_name"),
    col("hospital_address"),
    lit(None).cast("string").alias("hospital_location"),  # Not present in CSV
    col("last_updated_on").cast("string"),
    col("license_number").cast("string"),
    col("license_state"),

    # --- service / code ---
    col("description").alias("service_description"),
    col("code | 1").alias("code"),
    col("modifiers").cast("string"),
    col("code | 1 | type").alias("code_type"),
    col("setting").alias("care_setting"),

    # --- charges that do not depend on the payer ---
    col("standard_charge | gross").alias("gross_charge"),
    col("standard_charge | discounted_cash").alias("discounted_cash"),
    col("standard_charge | min").cast("long").alias("min_charge"),
    col("standard_charge | max").cast("long").alias("max_charge"),

    # --- payer / plan and negotiated charges ---
    col("payer_name"),
    col("plan_name"),
    col("billing_class"),
    col("standard_charge | methodology").alias("methodology"),
    col("standard_charge | negotiated_dollar").cast("long").alias("standard_charge_dollar"),
    col("standard_charge | negotiated_percentage").cast("long").alias("standard_charge_percentage"),
    col("additional_generic_notes").alias("additional_payer_notes"),
    col("standard_charge | negotiated_algorithm").alias("standard_charge_algorithm"),
])

# Check the resulting schema, then preview rows without truncating cell values.
aligned_df.printSchema()
aligned_df.show(truncate=False)

root
 |-- hospital_id: string (nullable = true)
 |-- hospital_name: string (nullable = true)
 |-- hospital_address: string (nullable = true)
 |-- hospital_location: string (nullable = true)
 |-- last_updated_on: string (nullable = true)
 |-- license_number: string (nullable = true)
 |-- license_state: string (nullable = true)
 |-- service_description: string (nullable = true)
 |-- code: string (nullable = true)
 |-- modifiers: string (nullable = true)
 |-- code_type: string (nullable = true)
 |-- care_setting: string (nullable = true)
 |-- gross_charge: double (nullable = true)
 |-- discounted_cash: double (nullable = true)
 |-- min_charge: long (nullable = true)
 |-- max_charge: long (nullable = true)
 |-- payer_name: string (nullable = true)
 |-- plan_name: string (nullable = true)
 |-- billing_class: string (nullable = true)
 |-- methodology: string (nullable = true)
 |-- standard_charge_dollar: long (nullable = true)
 |-- standard_charge_percentage: long (nullable = true)
 |-- addi