In [1]:
import os

from pyspark.sql import SparkSession

# Root directory of the Iceberg warehouse backing the `local` Hadoop catalog.
# Set the ICEBERG_WAREHOUSE environment variable to run the notebook on another
# machine; the default is the path the notebook was originally written against.
ICEBERG_WAREHOUSE = os.environ.get(
    "ICEBERG_WAREHOUSE",
    "/Users/neelkalavadiya/Practicum_Project_Local/iceberg_warehouse",
)

# Build (or reuse) the Spark session, registering an Iceberg catalog named
# `local` and enabling the Iceberg SQL extensions.
spark = (
    SparkSession.builder
    .appName("Healthcare Export")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memory", "4g")
    .config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.local.type", "hadoop")
    .config("spark.sql.catalog.local.warehouse", ICEBERG_WAREHOUSE)
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .getOrCreate()
)

# Only log ERROR and above from here on. Warnings emitted while the session is
# starting up are printed before this call takes effect.
spark.sparkContext.setLogLevel("ERROR")

25/05/02 14:52:30 WARN Utils: Your hostname, NEELs-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 10.190.186.201 instead (on interface en0)
25/05/02 14:52:30 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/02 14:52:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/05/02 14:52:32 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/05/02 14:52:32 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [2]:
# List every table registered in the `bronze` namespace of the `local` catalog,
# printing full (untruncated) cell values.
bronze_tables = spark.sql("SHOW TABLES IN local.bronze")
bronze_tables.show(truncate=False)

                                                                                

+---------+----------------------------+-----------+
|namespace|tableName                   |isTemporary|
+---------+----------------------------+-----------+
|bronze   |baptist_medical_center_sa_tx|false      |
+---------+----------------------------+-----------+



In [3]:
import os

# Connection settings for the PostgreSQL database that receives the export.
jdbc_url = "jdbc:postgresql://localhost:5432/healthcare_insurance"

# Credentials come from the environment so that no secret is stored in the
# notebook source:
#   PGUSER     - database user (defaults to "postgres")
#   PGPASSWORD - database password (no default)
jdbc_props = {
    "user": os.environ.get("PGUSER", "postgres"),
    "driver": "org.postgresql.Driver",
}

# Only pass a password when one was supplied. If PGPASSWORD is unset the
# connection is attempted without a password, which a server that requires
# password authentication is expected to reject at write time.
if os.environ.get("PGPASSWORD"):
    jdbc_props["password"] = os.environ["PGPASSWORD"]

In [8]:
# Copy the Baptist Medical Center bronze table from Iceberg into PostgreSQL.
df = spark.read.table("local.bronze.baptist_medical_center_sa_tx")
target_table = "baptist_bronze"
try:
    # "append" adds the rows to the target table rather than replacing it,
    # so running this cell again writes the same rows a second time.
    jdbc_writer = (
        df.write
        .format("jdbc")
        .option("url", jdbc_url)
        .option("dbtable", target_table)
        .options(**jdbc_props)
        .mode("append")
    )
    jdbc_writer.save()
    print(f"✅ Exported {target_table} to PostgreSQL")
except Exception as export_error:
    # Report the failure in the cell output, then re-raise so the cell is
    # still marked as failed.
    print(f"❌ Failed to export {target_table}: {export_error}")
    raise


[Stage 4:>                                                          (0 + 1) / 1]

✅ Exported baptist_bronze to PostgreSQL


                                                                                

In [14]:
# Preview the Kindred Ontario bronze table with untruncated cell values.
# NOTE(review): the output saved with this cell is a single count(1) row, which
# a SELECT * cannot produce - the query was probably edited after its last run.
# Re-run the cell (or restore the count query) so code and output agree.
kindred_preview = spark.sql("Select * from local.bronze.kindred_ontario")
kindred_preview.show(truncate=False)

+--------+
|count(1)|
+--------+
|150     |
+--------+



In [6]:
# Data Cleansing Notebook
# -------------------------------------
from pyspark.sql.functions import col, trim, current_timestamp, lit

# Step 1: Load the Kindred Ontario bronze table through the Iceberg reader
# and print its schema.
# NOTE(review): a later cell rebinds `df_bronze` to the Baptist table, so which
# table the cleaning cells operate on depends on cell execution order.
iceberg_reader = spark.read.format("iceberg")
df_bronze = iceberg_reader.load("local.bronze.kindred_ontario")
df_bronze.printSchema()

root
 |-- provider_id: string (nullable = true)
 |-- hospital_name: string (nullable = true)
 |-- hospital_address: string (nullable = true)
 |-- hospital_location: string (nullable = true)
 |-- last_updated_on: string (nullable = true)
 |-- license_number: string (nullable = true)
 |-- license_state: string (nullable = true)
 |-- service_description: string (nullable = true)
 |-- code: string (nullable = true)
 |-- code_type: string (nullable = true)
 |-- care_setting: string (nullable = true)
 |-- gross_charge: double (nullable = true)
 |-- discounted_cash: double (nullable = true)
 |-- min_charge: double (nullable = true)
 |-- max_charge: double (nullable = true)
 |-- payer_name: string (nullable = true)
 |-- plan_name: string (nullable = true)
 |-- methodology: string (nullable = true)
 |-- negotiated_dollar_amount: double (nullable = true)
 |-- additional_payer_notes: string (nullable = true)



In [5]:
# Load the Baptist Medical Center bronze table through the Iceberg reader and
# print its schema. This rebinds `df_bronze`, replacing whatever table an
# earlier cell loaded into that name.
baptist_reader = spark.read.format("iceberg")
df_bronze = baptist_reader.load("local.bronze.baptist_medical_center_sa_tx")
df_bronze.printSchema()


root
 |-- provider_id: string (nullable = true)
 |-- hospital_name: string (nullable = true)
 |-- hospital_address: string (nullable = true)
 |-- hospital_location: string (nullable = true)
 |-- last_updated_on: string (nullable = true)
 |-- license_number: string (nullable = true)
 |-- license_state: string (nullable = true)
 |-- service_description: string (nullable = true)
 |-- code: string (nullable = true)
 |-- code_type: string (nullable = true)
 |-- care_setting: string (nullable = true)
 |-- gross_charge: double (nullable = true)
 |-- discounted_cash: double (nullable = true)
 |-- min_charge: long (nullable = true)
 |-- max_charge: long (nullable = true)
 |-- payer_name: string (nullable = true)
 |-- plan_name: string (nullable = true)
 |-- methodology: string (nullable = true)
 |-- negotiated_dollar_amount: long (nullable = true)
 |-- additional_payer_notes: string (nullable = true)



In [21]:
from pyspark.sql.functions import col, sum as spark_sum, isnan, when
import pandas as pd

# Step 1: Count missing values in each column (NULL, plus NaN where NaN can occur).
# NaN is a floating-point value, so isnan() is only applied to float/double
# columns. Applying it to every column relies on an implicit cast to double,
# which fails for types that cannot be cast (e.g. a date column) and would
# treat the literal text "NaN" in a string column as missing.
nan_capable_cols = {
    name for name, dtype in df_clean.dtypes if dtype in ("float", "double")
}


def _is_missing(name):
    """Return a boolean Column that is true where column `name` is NULL or NaN."""
    missing = col(name).isNull()
    if name in nan_capable_cols:
        missing = missing | isnan(col(name))
    return missing


# A single aggregation pass produces one row holding one count per column.
null_counts_row = df_clean.select([
    spark_sum(when(_is_missing(c), 1).otherwise(0)).alias(c)
    for c in df_clean.columns
]).collect()[0]

# Step 2: Convert the results into a Pandas DataFrame for easy viewing
null_counts_dict = null_counts_row.asDict()
null_counts_df = pd.DataFrame(list(null_counts_dict.items()), columns=["Column", "Missing_Count"])

# Step 3: Optional – sort by most missing
null_counts_df = null_counts_df.sort_values(by="Missing_Count", ascending=False)

# Step 4: Show the result
from IPython.display import display
display(null_counts_df)

Unnamed: 0,Column,Missing_Count
0,provider_id,0
1,hospital_name,0
18,negotiated_dollar_amount,0
17,methodology,0
16,plan_name,0
15,payer_name,0
14,max_charge,0
13,min_charge,0
12,discounted_cash,0
11,gross_charge,0


In [None]:
# Step 1: Start the cleaned frame from bronze, discarding rows where either
# charge bound (min_charge / max_charge) is null
charge_bound_cols = ["min_charge", "max_charge"]
df_clean = df_bronze.dropna(how="any", subset=charge_bound_cols)


In [18]:
from pyspark.sql.functions import when, col

# Fill a null gross_charge with the midpoint of min_charge and max_charge when
# both bounds are present; any other row keeps its existing gross_charge.
has_both_bounds = col("min_charge").isNotNull() & col("max_charge").isNotNull()
charge_midpoint = (col("min_charge") + col("max_charge")) / 2
df_clean = df_clean.withColumn(
    "gross_charge",
    when(col("gross_charge").isNull() & has_both_bounds, charge_midpoint)
    .otherwise(col("gross_charge")),
)

In [None]:
# Fill a null standard_charge_dollar with gross_charge.
# The bronze schemas printed earlier in this notebook have no
# standard_charge_dollar column, and referencing a missing column makes
# withColumn fail, so the step only runs when the column is actually present.
if "standard_charge_dollar" in df_clean.columns:
    df_clean = df_clean.withColumn(
        "standard_charge_dollar",
        when(
            col("standard_charge_dollar").isNull(), col("gross_charge")
        ).otherwise(col("standard_charge_dollar"))
    )

In [19]:
# Keep discounted_cash where it is recorded; where it is null, fall back to
# gross_charge.
cash_price = col("discounted_cash")
df_clean = df_clean.withColumn(
    "discounted_cash",
    when(cash_price.isNotNull(), cash_price).otherwise(col("gross_charge")),
)

In [23]:
# Drop rows in which any of the required fields is still null after imputation
required_cols = ["provider_id", "hospital_name", "service_description", "gross_charge"]
df_clean = df_clean.dropna(how="any", subset=required_cols)


In [24]:
# Step 2: Remove rows that are exact duplicates across every column
df_clean = df_clean.distinct()

In [25]:
from pyspark.sql.functions import col, trim, current_timestamp, lit,upper,to_date
# Step 3: Standardize formats
# Payer and plan names: strip surrounding whitespace and upper-case.
for text_col in ("payer_name", "plan_name"):
    df_clean = df_clean.withColumn(text_col, upper(trim(col(text_col))))

# last_updated_on: parse the yyyy-MM-dd string into a date column.
df_clean = df_clean.withColumn(
    "last_updated_on",
    to_date(col("last_updated_on"), "yyyy-MM-dd"),
)


In [9]:
# Persist the cleaned frame as a Parquet checkpoint; "overwrite" replaces
# whatever a previous run left at this location.
checkpoint_path = "/Users/neelkalavadiya/Practicum_Project_Local/iceberg_warehouse/checkpoint_parquet/cleaning"
df_clean.write.parquet(checkpoint_path, mode="overwrite")

                                                                                