#NOTEBOOK 4/6: GOLD TABLE

##1. SELECTING THE CATALOG/SCHEMA AND READING SILVER DATA

In [0]:
# Point the session at the healthcare_analytics catalog and its gold schema,
# then echo the active catalog/schema as a sanity check.
for use_statement in ("USE CATALOG healthcare_analytics", "USE SCHEMA gold"):
    spark.sql(use_statement)

spark.sql("SELECT current_catalog(), current_schema()").show()

In [0]:
# Load the silver events table and report how many rows it holds
silver_df = spark.table("healthcare_analytics.silver.silver_events")

silver_row_count = silver_df.count()
print(f"Silver records: {silver_row_count}")

In [0]:
# Preview the first 10 rows of the silver data
silver_sample = silver_df.limit(10)
display(silver_sample)

In [0]:
# importing pyspark sql functions
from pyspark.sql.functions import *

##2. CREATE GOLD TABLE READMISSION BY SPECIALTY (Table 1)

In [0]:
# Readmission metrics per medical specialty.
# Rows whose specialty equals "Unknown" are excluded before aggregating, and
# only specialties with at least 100 encounters are kept. The result is sorted
# by readmission rate, highest first.
# NOTE: count/sum/round/avg below are the pyspark.sql.functions versions made
# available by the wildcard import, not the Python builtins.

specialty_metrics = [
    count("*").alias("total_encounters"),
    sum("readmitted_30days").alias("readmissions_30day"),
    round(avg("readmitted_30days") * 100, 2).alias("readmission_rate_pct"),
    round(avg("time_in_hospital"), 2).alias("avg_length_of_stay"),
    round(avg("num_medications"), 2).alias("avg_medications"),
]

specialty_df = (
    silver_df
    .where(col("medical_specialty") != "Unknown")
    .groupBy("medical_specialty")
    .agg(*specialty_metrics)
    .where(col("total_encounters") >= 100)
    .orderBy(col("readmission_rate_pct").desc())
)

In [0]:
# Persist the specialty metrics as gold table 1 (Delta, replacing any
# existing contents of the table).
gold_specialty = "healthcare_analytics.gold.readmission_by_specialty"

(
    specialty_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(gold_specialty)
)

print("Specialty table created!")

##3. CREATE GOLD TABLE PATIENT SUMMARY (Table 2)

In [0]:
# One summary row per patient (patient_nbr): visit count, total 30-day
# readmissions, per-patient maxima, and a risk tier derived from the
# readmission total.
# NOTE: count/sum/max/when/col are the pyspark.sql.functions versions made
# available by the wildcard import, not the Python builtins.

patient_metrics = [
    count("encounter_id").alias("total_visits"),
    sum("readmitted_30days").alias("total_readmissions"),
    max("num_medications").alias("max_medications"),
    max("number_diagnoses").alias("max_diagnoses"),
    max("time_in_hospital").alias("longest_stay"),
    # NOTE(review): this is max() over age_group, i.e. the largest value in
    # that column's sort order - confirm this really corresponds to the
    # patient's "latest" age group.
    max("age_group").alias("latest_age_group"),
]

# Tier thresholds are checked from highest to lowest; anything that matches
# none of them falls through to "Low".
readmission_total = col("total_readmissions")
risk_tier = (
    when(readmission_total >= 3, "Critical")
    .when(readmission_total >= 2, "High")
    .when(readmission_total >= 1, "Medium")
    .otherwise("Low")
)

patient_df = (
    silver_df
    .groupBy("patient_nbr")
    .agg(*patient_metrics)
    .withColumn("patient_risk_tier", risk_tier)
)



In [0]:
# Persist the patient summary as gold table 2 (Delta, replacing any
# existing contents of the table).
gold_patient = "healthcare_analytics.gold.patient_summary"

(
    patient_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(gold_patient)
)

print("Patient summary created!")


##4. CREATE GOLD TABLE AGE GROUP ANALYSIS (Table 3)

In [0]:
# Readmission metrics per age group, sorted by readmission rate (highest
# first).
# NOTE: count/sum/round/avg below are the pyspark.sql.functions versions made
# available by the wildcard import, not the Python builtins.

age_metrics = [
    count("*").alias("total_encounters"),
    sum("readmitted_30days").alias("readmissions"),
    round(avg("readmitted_30days") * 100, 2).alias("readmission_rate_pct"),
    round(avg("time_in_hospital"), 2).alias("avg_los"),
    round(avg("num_medications"), 2).alias("avg_medications"),
]

age_df = (
    silver_df
    .groupBy("age_group")
    .agg(*age_metrics)
    .orderBy(col("readmission_rate_pct").desc())
)

In [0]:
# Persist the age group metrics as gold table 3 (Delta, replacing any
# existing contents of the table).
gold_age = "healthcare_analytics.gold.age_group_analysis"

(
    age_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(gold_age)
)

print("Age group analysis created!")

##5. CREATE GOLD TABLE MEDICATION IMPACT (Table 4)

In [0]:
# Readmission metrics per medication-count category, sorted by readmission
# rate (highest first).
# NOTE: count/sum/round/avg below are the pyspark.sql.functions versions made
# available by the wildcard import, not the Python builtins.

medication_metrics = [
    count("*").alias("patient_count"),
    sum("readmitted_30days").alias("readmissions"),
    round(avg("readmitted_30days") * 100, 2).alias("readmission_rate_pct"),
    round(avg("time_in_hospital"), 2).alias("avg_los"),
]

medication_df = (
    silver_df
    .groupBy("num_medications_category")
    .agg(*medication_metrics)
    .orderBy(col("readmission_rate_pct").desc())
)

In [0]:
# Persist the medication metrics as gold table 4 (Delta, replacing any
# existing contents of the table).
gold_medication = "healthcare_analytics.gold.medication_impact"

(
    medication_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(gold_medication)
)

print("Medication impact created!")

##6. OPTIMISING THE TABLES AND CHECKING ROW COUNTS

In [0]:
# Run OPTIMIZE on each of the four gold tables, in the order they were built
for gold_table_name in (
    "healthcare_analytics.gold.readmission_by_specialty",
    "healthcare_analytics.gold.patient_summary",
    "healthcare_analytics.gold.age_group_analysis",
    "healthcare_analytics.gold.medication_impact",
):
    spark.sql(f"OPTIMIZE {gold_table_name}")

print("All gold tables optimized!")

In [0]:
# Final summary: print the row count of every gold table.
# Each entry is (label, table name, unit word shown after the count).
banner = "=" * 60
gold_summary = (
    ("Specialty metrics", gold_specialty, "specialties"),
    ("Patient summaries", gold_patient, "patients"),
    ("Age groups", gold_age, "groups"),
    ("Medication categories", gold_medication, "categories"),
)

print(banner)
print("GOLD LAYER COMPLETE")
print(banner)
for label, table_name, unit in gold_summary:
    print(f"{label}: {spark.table(table_name).count()} {unit}")
print(banner)