In [1]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session for this notebook: 8g of driver memory,
# the Iceberg runtime and PostgreSQL JDBC jars, and a Hadoop-type Iceberg
# catalog named "local" whose warehouse is a local directory.
spark = (
    SparkSession.builder
    .appName("Healthcare Export")
    .config("spark.driver.memory", "8g")
    .config(
        "spark.jars",
        # Two adjacent literals form one comma-separated jar list.
        "/Users/neelkalavadiya/spark-jars/iceberg-spark-runtime-3.4_2.12-1.3.1.jar,"
        "/Users/neelkalavadiya/spark-jars/postgresql-42.7.2.jar",
    )
    .config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.local.type", "hadoop")
    .config("spark.sql.catalog.local.warehouse", "/Users/neelkalavadiya/iceberg_warehouse")
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .getOrCreate()
)


25/04/16 10:29:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/16 10:29:45 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/04/16 10:29:45 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [20]:
from pyspark.sql.functions import col, when, lit, current_timestamp

# Read the cleaned dataset that Notebook 1 left behind as parquet.
cleaned_parquet_dir = "/Users/neelkalavadiya/iceberg_warehouse/checkpoint_parquet/transform"
df_cleaned = spark.read.format("parquet").load(cleaned_parquet_dir)

In [21]:
# --- Enrichment --- #

# Start the enriched frame from the cleaned data, adding a
# silver_ingestion_ts column populated from current_timestamp().
ingestion_ts = current_timestamp()
df_enriched = df_cleaned.withColumn("silver_ingestion_ts", ingestion_ts)

In [22]:
# Classify payer category
from pyspark.sql.functions import when

# Rules are evaluated top to bottom and the first match wins. The first three
# look at payer_name, the Exchange rule looks at plan_name, and anything
# unmatched becomes "Other". All patterns are case-insensitive ((?i)).
payer_name_col = col("payer_name")
plan_name_col = col("plan_name")

payer_category_expr = (
    when(payer_name_col.rlike("(?i)aetna|cigna|humana|anthem|imagine health|blue cross"), "Commercial")
    .when(payer_name_col.rlike("(?i)medicare|provider partners|devoted"), "Medicare")
    .when(payer_name_col.rlike("(?i)medicaid|community first|molina"), "Medicaid")
    .when(plan_name_col.rlike("(?i)HIX|exchange|blue advantage"), "Exchange")
    .otherwise("Other")
)

df_enriched = df_enriched.withColumn("payer_category", payer_category_expr)

In [23]:
# Classify the pricing model from the free-text methodology column.
# Case-insensitive; first matching rule wins, unmatched rows become "Other".
methodology_col = col("methodology")
pricing_model_expr = (
    when(methodology_col.rlike("(?i)case rate"), "Case Rate")
    .when(methodology_col.rlike("(?i)fee schedule"), "Fee Schedule")
    .when(methodology_col.rlike("(?i)percent|percentage"), "Percentage-Based")
    .otherwise("Other")
)
df_enriched = df_enriched.withColumn("pricing_model", pricing_model_expr)

In [24]:
# Derive a coarse plan type from plan_name.
# Case-insensitive; rules are checked in order (HMO, PPO, Exchange, Government).
plan_label = col("plan_name")
plan_type_expr = (
    when(plan_label.rlike("(?i)hmo"), "HMO")
    .when(plan_label.rlike("(?i)ppo"), "PPO")
    .when(plan_label.rlike("(?i)hix|exchange"), "Exchange")
    .when(plan_label.rlike("(?i)medicare|medicaid"), "Government")
    .otherwise("Other")
)
df_enriched = df_enriched.withColumn("plan_type", plan_type_expr)

In [25]:
# Bucket standard_charge_dollar: below 100 is "Low", 100 to 1000 inclusive is
# "Medium", above 1000 is "High". Rows matching none of these are "Unknown".
charge = col("standard_charge_dollar")
charge_bucket_expr = (
    when(charge < 100, "Low")
    .when((charge >= 100) & (charge <= 1000), "Medium")
    .when(charge > 1000, "High")
    .otherwise("Unknown")
)
df_enriched = df_enriched.withColumn("charge_bucket", charge_bucket_expr)

In [26]:
# Flag rows where payer_name, plan_name or methodology is null.
# isNull() yields a definite true/false, so the disjunction is itself the flag.
any_payer_field_missing = (
    col("payer_name").isNull()
    | col("plan_name").isNull()
    | col("methodology").isNull()
)
df_enriched = df_enriched.withColumn("payer_info_missing", any_payer_field_missing)

In [27]:
# Classify treatment types from service description (simplified rules)
from pyspark.sql.functions import col, when, regexp_extract, lower

# Keyword-based classification of service_description.
#
# Fix: short abbreviations (ct, inj, tb, cp, mg, ij, sol, ih, is) used to be
# matched as bare substrings. "ct" occurs inside "injection", "resection",
# "fracture" and similar words, and Imaging is the first rule, so those rows
# were labelled "Imaging" / "CT Scan" and never reached the Medication or
# Procedure rules. Abbreviations are now matched only when they are not
# embedded in a longer run of letters. Full words stay substring matches so
# plurals and compounds keep matching.


def _abbr(*tokens):
    """Build a regex alternation for short abbreviations.

    The returned pattern matches any of ``tokens`` only when the token is not
    directly preceded or followed by another lower-case letter. Letter-based
    lookarounds are used instead of word-boundary anchors so that a dose
    written as "500mg" still matches "mg" (a digit may touch the token).

    Args:
        *tokens: literal lower-case abbreviations, e.g. "ct", "mg".

    Returns:
        A regex string usable with Column.rlike on lower-cased text.
    """
    return "(?<![a-z])(?:" + "|".join(tokens) + ")(?![a-z])"


# Lower-cased description shared by every rule below.
service_desc = lower(col("service_description"))

# Treatment type: rules are checked in order and the first match wins.
df_enriched = df_enriched.withColumn(
    "treatment_type",
    when(service_desc.rlike(_abbr("ct") + "|mri|x-ray|ultrasound|imaging"), "Imaging")
    .when(
        service_desc.rlike(
            _abbr("inj", "tb", "cp", "mg") + "|injection|tablet|oral|syrup|solution|suspension"
        ),
        "Medication",
    )
    .when(service_desc.rlike("biopsy|surgery|resection|repair|ablation|implant|arthroplasty|graft"), "Procedure")
    .when(service_desc.rlike("panel|ab/|antibody|lab|urine|test|analysis|level|quant"), "Lab Test")
    # NOTE: "graft" is already claimed by the Procedure rule above, so it can
    # never select Supply/Device here; kept to mirror the original keyword list.
    .when(service_desc.rlike("device|supply|graft|stent|pump|dressing"), "Supply/Device")
    .otherwise("Other")
)

# is_medication flag (False when nothing matches, including a null description).
df_enriched = df_enriched.withColumn(
    "is_medication",
    when(
        service_desc.rlike(_abbr("mg", "tb", "cp") + "|solution|suspension|syrup|inhalation|injection"),
        True,
    ).otherwise(False)
)

# Drug form (simple classification, first match wins).
df_enriched = df_enriched.withColumn(
    "drug_form",
    when(service_desc.rlike(_abbr("tb", "cp") + "|tablet"), "Tablet")
    .when(service_desc.rlike(_abbr("inj", "ij") + "|injection"), "Injection")
    .when(service_desc.rlike(_abbr("sol") + "|solution"), "Solution")
    .when(service_desc.rlike("cream|ointment"), "Topical")
    # A bare "is" substring matched almost any description ("analysis",
    # "dialysis"); it is now treated as a standalone token.
    .when(service_desc.rlike(_abbr("ih", "is") + "|inhalation"), "Inhaler")
    .otherwise("Other")
)

# Imaging type (subset of treatment_type); null when no imaging keyword is found.
df_enriched = df_enriched.withColumn(
    "imaging_type",
    when(service_desc.rlike("mri"), "MRI")
    .when(service_desc.rlike(_abbr("ct")), "CT Scan")
    .when(service_desc.rlike("x-ray"), "X-Ray")
    .when(service_desc.rlike("ultrasound"), "Ultrasound")
    .otherwise(None)
)

# Lab test flag (keyword list unchanged).
df_enriched = df_enriched.withColumn(
    "is_lab_test",
    when(service_desc.rlike("panel|antibody|test|level|urine|blood|cbc|cmp|lipid"), True).otherwise(False)
)

# Brand indicator (recognizing known brand/device names; keyword list unchanged).
df_enriched = df_enriched.withColumn(
    "has_brand_indicator",
    when(service_desc.rlike("stryker|depuy|bard|zimmer|philips|medtronic|covidien|smith"), True).otherwise(False)
)

In [28]:
from pyspark.sql.functions import split, trim, regexp_extract, col

# Split hospital_address on commas once and reuse the pieces:
# element 0 -> street, element 1 -> city, element 2 -> combined state/ZIP.
address_parts = split(col("hospital_address"), ",")
df_address_split = (
    df_enriched
    .withColumn("street", trim(address_parts.getItem(0)))
    .withColumn("city", trim(address_parts.getItem(1)))
    .withColumn("state_zip", trim(address_parts.getItem(2)))
)

# Pull the first two-capital-letter run (state) and the first five-digit run
# (ZIP) out of state_zip, then discard the helper column.
state_zip_col = col("state_zip")
df_address_cleaned = (
    df_address_split
    .withColumn("state", regexp_extract(state_zip_col, r"([A-Z]{2})", 1))
    .withColumn("zip_code", regexp_extract(state_zip_col, r"(\d{5})", 1))
    .drop("state_zip")
)

# Spot-check the parsed address fields on a few rows.
address_preview_cols = ["hospital_address", "street", "city", "state", "zip_code"]
df_address_cleaned.select(*address_preview_cols).show(5, truncate=False)

+----------------------------------------+---------------+-------------+-----+--------+
|hospital_address                        |street         |city         |state|zip_code|
+----------------------------------------+---------------+-------------+-----+--------+
|600 N Union Ave, New Braunfels, TX 78130|600 N Union Ave|New Braunfels|TX   |78130   |
|600 N Union Ave, New Braunfels, TX 78130|600 N Union Ave|New Braunfels|TX   |78130   |
|600 N Union Ave, New Braunfels, TX 78130|600 N Union Ave|New Braunfels|TX   |78130   |
|600 N Union Ave, New Braunfels, TX 78130|600 N Union Ave|New Braunfels|TX   |78130   |
|600 N Union Ave, New Braunfels, TX 78130|600 N Union Ave|New Braunfels|TX   |78130   |
+----------------------------------------+---------------+-------------+-----+--------+
only showing top 5 rows



In [29]:
# Write to Parquet (Intermediate Step), replacing any previous output.
# NOTE(review): this directory sits inside .../checkpoint_parquet/transform,
# which the load cell reads as its input. On a re-run the reader may also pick
# up these enrichment files - confirm, and consider a sibling directory.
parquet_path = "/Users/neelkalavadiya/iceberg_warehouse/checkpoint_parquet/transform/enrichment"
df_address_cleaned.write.format("parquet").mode("overwrite").save(parquet_path)

                                                                                

In [30]:
# Read the freshly written enrichment output back for inspection.
df_check = spark.read.format("parquet").load(parquet_path)

In [31]:
# Print the column names and types of the re-read enrichment output.
df_check.printSchema()

root
 |-- provider_id: string (nullable = true)
 |-- hospital_name: string (nullable = true)
 |-- hospital_address: string (nullable = true)
 |-- hospital_location: string (nullable = true)
 |-- last_updated_on: date (nullable = true)
 |-- license_number: string (nullable = true)
 |-- license_state: string (nullable = true)
 |-- service_description: string (nullable = true)
 |-- drug_type: string (nullable = true)
 |-- drug_unit: string (nullable = true)
 |-- code: string (nullable = true)
 |-- modifiers: string (nullable = true)
 |-- code_type: string (nullable = true)
 |-- care_setting: string (nullable = true)
 |-- gross_charge: double (nullable = true)
 |-- discounted_cash: double (nullable = true)
 |-- min_charge: double (nullable = true)
 |-- max_charge: double (nullable = true)
 |-- payer_name: string (nullable = true)
 |-- plan_name: string (nullable = true)
 |-- billing_class: string (nullable = true)
 |-- methodology: string (nullable = true)
 |-- standard_charge_dollar: doub

In [32]:
from pyspark.sql.functions import col

# Step 1: Read the enriched parquet file
df = spark.read.parquet(parquet_path)

# Step 2: Define columns to retain (this list also fixes the output column order)
columns_to_keep = [
    "provider_id", "hospital_name", "city", "state", "last_updated_on", "license_number",
    "license_state", "code", "modifiers", "code_type", "care_setting",
    "gross_charge", "discounted_cash", "min_charge", "max_charge",
    "payer_name", "plan_name", "billing_class", "methodology",
    "standard_charge_dollar", "standard_charge_percentage",
    "silver_ingestion_ts", "payer_category", "pricing_model", "plan_type",
    "charge_bucket", "payer_info_missing", "treatment_type", "is_medication",
    "drug_form", "imaging_type", "is_lab_test", "has_brand_indicator"
]

# Step 3: Work out which columns will be left out.
# Walk df.columns instead of subtracting sets so the report is printed in the
# frame's own column order on every run (set iteration order is not stable).
all_columns = df.columns
keep_lookup = set(columns_to_keep)
columns_to_drop = [name for name in all_columns if name not in keep_lookup]

print("Dropping the following unnecessary columns:")
for col_name in columns_to_drop:
    print(f" - {col_name}")

# Step 4: Build the final DataFrame by selecting the retained columns
# (nothing is dropped explicitly; unselected columns are simply left out).
# NOTE: this rebinds df_cleaned, which the load cell earlier in the notebook
# also assigns. Re-running the enrichment cells after this one without
# re-running the load cell would start from this projected frame.
df_cleaned = df.select([col(c) for c in columns_to_keep])


Dropping the following unnecessary columns:
 - drug_type
 - payer_id
 - additional_payer_notes
 - hospital_location
 - service_description
 - drug_unit
 - zip_code
 - hospital_address
 - street


In [33]:
# Remove any existing local.silver.hospital_demo table (IF EXISTS makes this a no-op when absent).
spark.sql("DROP TABLE IF EXISTS local.silver.hospital_demo")

DataFrame[]

In [36]:

# Step 5: Write to Iceberg silver table
# Configure the writer first (Iceberg provider, format-version 2), then
# create the table or replace it if it already exists.
silver_writer = (
    df_cleaned.writeTo("local.silver.hospital_demo")
    .using("iceberg")
    .tableProperty("format-version", "2")
)
silver_writer.createOrReplace()

                                                                                

In [37]:
# List the tables in the local.silver namespace to confirm the write.
silver_tables = spark.sql("SHOW TABLES IN local.silver")
silver_tables.show(truncate=False)

+---------+-------------------------+-----------+
|namespace|tableName                |isTemporary|
+---------+-------------------------+-----------+
|silver   |mission_trail_baptist    |false      |
|silver   |hospital_demo            |false      |
|silver   |santa_rosa_new_braunfels |false      |
|silver   |resolute_health          |false      |
|silver   |santa_rosa_westover_hills|false      |
|silver   |north_central_baptist    |false      |
|silver   |santa_rosa_medical_center|false      |
|silver   |baptist_medical_center   |false      |
+---------+-------------------------+-----------+

