In [0]:
# --------------------------------
# Gold Layer - Medallion Architecture (Service Principal Auth)
# --------------------------------

# Service Principal credentials are read from the Databricks secret scope so
# that no credential literal appears in the notebook.
_secret_scope = "secretscope_datacapus6"
client_id = dbutils.secrets.get(scope=_secret_scope, key="client-id")
client_secret = dbutils.secrets.get(scope=_secret_scope, key="client-secret")
tenant_id = dbutils.secrets.get(scope=_secret_scope, key="tenant-id")

# Storage account and the containers holding the Silver and Gold layers.
storage_account = "storageaccus6"
silver_container = "silver"
gold_container = "gold"

# OAuth (client credentials) settings for ABFS, applied in the order listed.
# NOTE(review): these keys carry no storage-account suffix, so they presumably
# apply to every ABFS account used in this Spark session - confirm intended.
for _conf_key, _conf_value in (
    ("fs.azure.account.auth.type", "OAuth"),
    ("fs.azure.account.oauth.provider.type", "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider"),
    ("fs.azure.account.oauth2.client.id", client_id),
    ("fs.azure.account.oauth2.client.secret", client_secret),
    ("fs.azure.account.oauth2.client.endpoint", f"https://login.microsoftonline.com/{tenant_id}/oauth2/token"),
):
    spark.conf.set(_conf_key, _conf_value)

# Container roots (abfss URIs ending in "/") for the Silver and Gold layers.
silver_base = f"abfss://{silver_container}@{storage_account}.dfs.core.windows.net/"
gold_base = f"abfss://{gold_container}@{storage_account}.dfs.core.windows.net/"

# Silver Delta table locations (inputs).
silver_house_path = silver_base + "house_price"
silver_orders_path = silver_base + "sales_products"
silver_world_path = silver_base + "population"

# Gold Delta table locations (outputs).
gold_house_path = gold_base + "house_price_summary"
gold_orders_path = gold_base + "sales_orders_summary"
gold_world_path = gold_base + "world_population_summary"

# --------------------------------
# Read Silver Delta tables
# --------------------------------
# Each Silver table is loaded as a Delta source from its abfss path.
df_house_silver = spark.read.load(silver_house_path, format="delta")
df_orders_silver = spark.read.load(silver_orders_path, format="delta")
df_world_silver = spark.read.load(silver_world_path, format="delta")

from pyspark.sql import functions as F

# --------------------------------
# Gold Transformations (Business KPIs)
# --------------------------------


def _count_summary(df, total_alias, unique_alias):
    """Collapse *df* to one row of counts plus a processing timestamp.

    The row holds the total row count (as ``total_alias``), the number of
    distinct ``value_clean`` values (as ``unique_alias``) and a
    ``processed_at`` column set to the current timestamp.
    """
    counts = df.agg(
        F.count("*").alias(total_alias),
        F.countDistinct("value_clean").alias(unique_alias),
    )
    return counts.withColumn("processed_at", F.current_timestamp())


# House prices: row count and mean character length of `value_clean`.
_record_length = F.length(F.col("value_clean"))
_house_kpis = df_house_silver.agg(
    F.count("*").alias("total_records"),
    F.avg(_record_length).alias("avg_record_length"),
)
df_house_gold = _house_kpis.withColumn("processed_at", F.current_timestamp())

# Orders and population share the same total / distinct count summary.
df_orders_gold = _count_summary(df_orders_silver, "total_orders", "unique_orders")
df_world_gold = _count_summary(
    df_world_silver,
    "total_population_records",
    "unique_population_entries",
)

# --------------------------------
# Write Gold Delta tables
# --------------------------------
# Each summary replaces whatever is currently stored at its Gold path.
for _frame, _target in (
    (df_house_gold, gold_house_path),
    (df_orders_gold, gold_orders_path),
    (df_world_gold, gold_world_path),
):
    _frame.write.save(_target, format="delta", mode="overwrite")

# --------------------------------
# Delta Lake Time Travel (Gold)
# --------------------------------
# Load version 0 of every Gold table through the `versionAsOf` read option.
# NOTE(review): version 0 is presumably each table's first commit, so on a
# rerun it may differ from the snapshot written above - confirm intended.
df_house_gold_v0 = spark.read.load(gold_house_path, format="delta", versionAsOf=0)
df_orders_gold_v0 = spark.read.load(gold_orders_path, format="delta", versionAsOf=0)
df_world_gold_v0 = spark.read.load(gold_world_path, format="delta", versionAsOf=0)

# Print a heading and render each snapshot, in house / orders / world order.
for _heading, _snapshot in (
    ("Gold House Price Summary (version 0):", df_house_gold_v0),
    ("Gold Sales Orders Summary (version 0):", df_orders_gold_v0),
    ("Gold World Population Summary (version 0):", df_world_gold_v0),
):
    print(_heading)
    display(_snapshot)