In [1]:
from pyspark.sql import SparkSession

# Create (or reuse, via getOrCreate) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("Healthcare Export")
    .config("spark.driver.memory", "8g")
    # Extra JARs put on the classpath: the Iceberg Spark runtime and the
    # PostgreSQL JAR, passed to Spark as one comma-separated string.
    .config(
        "spark.jars",
        "/Users/neelkalavadiya/spark-jars/iceberg-spark-runtime-3.4_2.12-1.3.1.jar,"
        "/Users/neelkalavadiya/spark-jars/postgresql-42.7.2.jar",
    )
    # Register a catalog named `local`: an Iceberg SparkCatalog of type
    # "hadoop" whose warehouse is a directory on the local filesystem.
    .config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.local.type", "hadoop")
    .config("spark.sql.catalog.local.warehouse", "/Users/neelkalavadiya/iceberg_warehouse")
    # Enable the Iceberg SQL extensions for this session.
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .getOrCreate()
)


25/04/15 13:58:18 WARN Utils: Your hostname, NEELs-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 10.190.162.30 instead (on interface en0)
25/04/15 13:58:18 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
25/04/15 13:58:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/15 13:58:19 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/04/15 13:58:19 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [9]:
# List every table registered under the `bronze` namespace of the `local`
# catalog; truncate=False keeps the full table names in the printed output.
bronze_tables = spark.sql("SHOW TABLES IN local.bronze")
bronze_tables.show(truncate=False)

+---------+-------------------------+-----------+
|namespace|tableName                |isTemporary|
+---------+-------------------------+-----------+
|bronze   |santa_rosa_westover_hills|false      |
|bronze   |baptist_medical_center   |false      |
|bronze   |resolute_health          |false      |
|bronze   |santa_rosa_medical_center|false      |
|bronze   |santa_rosa_new_braunfels |false      |
|bronze   |mission_trail_baptist    |false      |
|bronze   |north_central_baptist    |false      |
+---------+-------------------------+-----------+



In [40]:
# Data Cleansing Notebook
# -------------------------------------
from pyspark.sql.functions import col, trim, current_timestamp, lit

# Load the bronze-layer Iceberg table for this hospital, then print its schema
# so the column names and types are visible before any cleansing is applied.
bronze_reader = spark.read.format("iceberg")
df_bronze = bronze_reader.load("local.bronze.north_central_baptist")
df_bronze.printSchema()

root
 |-- provider_id: string (nullable = true)
 |-- hospital_name: string (nullable = true)
 |-- hospital_address: string (nullable = true)
 |-- hospital_location: string (nullable = true)
 |-- last_updated_on: string (nullable = true)
 |-- license_number: string (nullable = true)
 |-- license_state: string (nullable = true)
 |-- service_description: string (nullable = true)
 |-- code: string (nullable = true)
 |-- modifiers: string (nullable = true)
 |-- code_type: string (nullable = true)
 |-- care_setting: string (nullable = true)
 |-- gross_charge: double (nullable = true)
 |-- discounted_cash: double (nullable = true)
 |-- min_charge: long (nullable = true)
 |-- max_charge: long (nullable = true)
 |-- payer: struct (nullable = true)
 |    |-- additional_payer_notes: string (nullable = true)
 |    |-- billing_class: string (nullable = true)
 |    |-- methodology: string (nullable = true)
 |    |-- payer_name: string (nullable = true)
 |    |-- plan_name: string (nullable = true)
 

In [41]:
# Step 1: Drop records with critical nulls
# A row is discarded when ANY of the columns listed below is missing.
required_columns = [
    "provider_id",
    "hospital_name",
    "service_description",
    "gross_charge",
]
df_clean = df_bronze.na.drop(subset=required_columns)

In [42]:
# Step 2: Deduplicate
# Rows are compared across all columns; only exact duplicates are collapsed.
df_clean = df_clean.distinct()

In [43]:
from pyspark.sql.functions import col, trim, current_timestamp, lit,upper,to_date
# Step 3: Format standardization
# In the bronze schema, payer_name and plan_name exist only as fields of the
# `payer` struct, so they must be addressed as "payer.<field>"; an unqualified
# col("payer_name") does not resolve to any top-level column.
# The trimmed, upper-cased values are written to new top-level columns
# `payer_name` / `plan_name`; the original `payer` struct is left unchanged.
# last_updated_on is parsed from a string into a date using "yyyy-MM-dd".
df_clean = (
    df_clean
    .withColumn("payer_name", upper(trim(col("payer.payer_name"))))
    .withColumn("plan_name", upper(trim(col("payer.plan_name"))))
    .withColumn("last_updated_on", to_date(col("last_updated_on"), "yyyy-MM-dd"))
)


In [44]:
# Step 4: Remove unrealistic values
# Keep a row only when its gross charge is strictly positive AND its minimum
# charge does not exceed its maximum charge.
# NOTE: a comparison involving NULL is not true in Spark SQL, so rows whose
# min_charge or max_charge is NULL are removed by this filter as well.
has_positive_gross = col("gross_charge") > 0
charge_bounds_ordered = col("min_charge") <= col("max_charge")
df_clean = df_clean.filter(has_positive_gross & charge_bounds_ordered)

In [45]:
# Persist the cleansed frame as Parquet files; "overwrite" mode replaces
# whatever a previous run left at the target path.
checkpoint_writer = df_clean.write.mode("overwrite")
checkpoint_writer.parquet("/Users/neelkalavadiya/iceberg_warehouse/checkpoint_parquet/transform")

                                                                                