# Gold Layer Ingestion

Goal: Transform Silver tables into a dimensional model to facilitate analytics.

Source Tables:
- silver.customers
- silver.products
- silver.orders
- silver.sales
- silver.countries

Gold Tables:
- gold.dim_customer   (customer attributes and country reference)
- gold.dim_product    (product attributes and manufacturing country)
- gold.dim_country    (geographic and economic attributes)
- gold.dim_date       (calendar dimension)
- gold.fact_sales     (sale-line grain fact table)

Design Notes:
- All dimensions use warehouse-owned surrogate keys.
- Fact table grain: one row per order line item (sale_id).
- Fact table includes placeholder measures for future enrichment.
- Dimension tables include SCD scaffolding.
- Source system identifiers are retained as natural keys but are not used to join Gold tables; those joins use the surrogate keys.




## Imports and Context Setting


### Imports

In [0]:
from pyspark.sql import DataFrame
from pyspark.sql import functions as F
from pyspark.sql import Window 


### Set Databricks context

In [0]:
# Catalog and schema names used to build fully qualified table names below.
CATALOG = "md_sales_dashboard"
SILVER_SCHEMA = "silver"
GOLD_SCHEMA = "gold"

# Open-ended "end of validity" timestamp used as valid_to_ts for current dimension rows.
FAR_FUTURE_TS = F.to_timestamp(F.lit("9999-12-31 00:00:00"))

# Session context, applied in order: select the catalog, make sure the Gold
# schema exists inside it, then make Gold the default schema.
for _context_sql in (
    f"USE CATALOG {CATALOG}",
    f"CREATE SCHEMA IF NOT EXISTS {GOLD_SCHEMA}",
    f"USE SCHEMA {GOLD_SCHEMA}",
):
    spark.sql(_context_sql)

## Helper Functions Definition
### add_scd_scaffold
1. This helper function adds the SCD Type 2 tracking columns (`valid_from_ts`, `valid_to_ts`, `is_current`) to a dimension table.




In [0]:
def add_scd_scaffold(df: DataFrame) -> DataFrame:
    """
    Return ``df`` with the SCD Type 2 tracking columns appended.

    Every row is stamped as the current version:

    - ``valid_from_ts``: timestamp at which the query runs
    - ``valid_to_ts``:   the FAR_FUTURE_TS sentinel (open-ended validity)
    - ``is_current``:    boolean True

    A column that already exists under one of these names is replaced.
    """
    scd_columns = (
        ("valid_from_ts", F.current_timestamp()),
        ("valid_to_ts", FAR_FUTURE_TS),
        ("is_current", F.lit(True).cast("boolean")),
    )

    scaffolded = df
    for column_name, column_expr in scd_columns:
        scaffolded = scaffolded.withColumn(column_name, column_expr)
    return scaffolded

## Load Gold Delta Tables

### Load Silver Sources





In [0]:
# Lazy handles on the Silver source tables; nothing is read until an action runs.
(
    silver_customers,
    silver_countries,
    silver_products,
    silver_sales,
    silver_orders,
) = (
    spark.table(f"{CATALOG}.{SILVER_SCHEMA}.{silver_table}")
    for silver_table in ("customers", "countries", "products", "sales", "orders")
)

### *Insert into dim_country*

In [0]:
# NOTE: The window is deliberately unpartitioned (single global ordering). dim_country is
# small (<100k rows), and the surrogate key has to be numbered across the whole table.
w_country = Window.orderBy(F.col("country_id").asc())

# Silver columns carried into the dimension, in output order (natural key first).
country_source_cols = [
    "country_id",
    "country",
    "country_name",
    "currency",
    "region",
    "population",
    "area_sq_mi",
    "pop_density_per_sq_mi",
    "coastline_coast_per_area_ratio",
    "net_migration",
    "infant_mortality_per_1000_births",
    "gdp_per_capita",
    "literacy_pct",
    "phones_per_1000",
    "arable_pct",
    "crops_pct",
    "other_pct",
    "climate",
    "birthrate",
    "deathrate",
    "agriculture",
    "industry",
    "service",
]

# One row per natural key, then number the rows in country_id order.
country_deduped = silver_countries.select(*country_source_cols).dropDuplicates(["country_id"])
country_keyed = country_deduped.withColumn("country_key", F.dense_rank().over(w_country))

# Surrogate key leads the column list; SCD scaffolding columns are appended last.
dim_country = add_scd_scaffold(
    country_keyed.select("country_key", *country_source_cols)
)

# Write to Gold (full rebuild: existing data and schema are replaced)
country_writer = (
    dim_country.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
country_writer.saveAsTable(f"{CATALOG}.{GOLD_SCHEMA}.dim_country")



#### *Insert into dim_customer*

In [0]:
# NOTE: The window is deliberately unpartitioned (single global ordering). dim_customer is
# small (<100k rows), and the surrogate key has to be numbered across the whole table.
w_customer = Window.orderBy(F.col("customer_id").asc())

# One row per customer_id, numbered in customer_id order.
customer_keyed = (
    silver_customers
    .select(
        "customer_id",
        "is_active",
        "full_name",
        "address",
        "city",
        "country_id",
        "email",
    )
    .dropDuplicates(["customer_id"])
    .withColumn("customer_key", F.dense_rank().over(w_customer))
)

# Natural key -> surrogate key mapping used to resolve the customer's country.
customer_country_lookup = dim_country.select("country_id", "country_key")

# Left join keeps customers whose country_id has no match (country_key stays null).
customer_resolved = customer_keyed.join(
    customer_country_lookup,
    on="country_id",
    how="left",
)

# Final column order, with SCD scaffolding columns appended.
dim_customer = add_scd_scaffold(
    customer_resolved.select(
        "customer_key",
        "customer_id",
        "is_active",
        "full_name",
        "address",
        "city",
        "email",
        "country_key",
    )
)

# Write to Gold (full rebuild: existing data and schema are replaced)
customer_writer = (
    dim_customer.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
customer_writer.saveAsTable(f"{CATALOG}.{GOLD_SCHEMA}.dim_customer")

#### *Insert into dim_product*

In [0]:
# NOTE: The window is deliberately unpartitioned (single global ordering). dim_product is
# small (<100k rows), and the surrogate key has to be numbered across the whole table.
w_product = Window.orderBy(F.col("product_id").asc())

# One row per product_id, numbered in product_id order.
product_keyed = (
    silver_products
    .select(
        "product_id",
        "product_name",
        "manufactured_country_id",
        "weight_in_grams",
    )
    .dropDuplicates(["product_id"])
    .withColumn("product_key", F.dense_rank().over(w_product))
)

# Country natural key -> surrogate key, renamed to match the product's
# manufacturing-country columns so the join can use a single column name.
manufacturing_country_lookup = dim_country.select(
    F.col("country_id").alias("manufactured_country_id"),
    F.col("country_key").alias("manufactured_country_key"),
)

# Left join keeps products whose manufacturing country has no match (key stays null).
product_resolved = product_keyed.join(
    manufacturing_country_lookup,
    on="manufactured_country_id",
    how="left",
)

# Final column order, with SCD scaffolding columns appended.
dim_product = add_scd_scaffold(
    product_resolved.select(
        "product_key",
        "product_id",
        "product_name",
        "weight_in_grams",
        "manufactured_country_key",
    )
)

# Write to Gold (full rebuild: existing data and schema are replaced)
product_writer = (
    dim_product.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
product_writer.saveAsTable(f"{CATALOG}.{GOLD_SCHEMA}.dim_product")


#### *Build dim_date*

In [0]:
# Calendar bounds: earliest and latest order date present in silver.orders.
# Both values are None when the table has no rows (or no non-null dates).
date_boundaries = silver_orders.select(
    F.min("date").alias("min_date"),
    F.max("date").alias("max_date")
).collect()[0]

min_order_date = date_boundaries["min_date"]
max_order_date = date_boundaries["max_date"]

# Build the date spine from typed literals instead of interpolating the Python
# repr of the bounds into SQL text. With None bounds the sequence is NULL and
# explode() yields zero rows, so an empty orders table produces an empty
# dimension rather than asking Spark to parse the string 'None' as a date.
calendar_spine = spark.range(1).select(
    F.explode(
        F.sequence(
            F.to_date(F.lit(min_order_date)),
            F.to_date(F.lit(max_order_date)),
            F.expr("interval 1 day"),
        )
    ).alias("calendar_date")
)

dim_date = (
    calendar_spine
    # Smart key in yyyyMMdd form, e.g. 2024-03-09 -> 20240309.
    .withColumn("date_key", F.date_format(F.col("calendar_date"), "yyyyMMdd").cast("int"))
    .withColumn("year", F.year("calendar_date"))
    .withColumn("quarter", F.quarter("calendar_date"))
    .withColumn("month", F.month("calendar_date"))
    .withColumn("month_name", F.date_format("calendar_date", "MMMM"))
    .withColumn("day", F.dayofmonth("calendar_date"))
    # Abbreviated day name (pattern "E").
    .withColumn("day_of_week", F.date_format("calendar_date", "E"))
    # Spark's dayofweek() numbers Sunday as 1 and Saturday as 7.
    .withColumn("is_weekend", F.dayofweek("calendar_date").isin([1, 7]))
    .select("date_key", "calendar_date", "year", "quarter", "month", "month_name", "day", "day_of_week", "is_weekend")
)


# Write to Gold (full rebuild: existing data and schema are replaced)
(
dim_date.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(f"{CATALOG}.{GOLD_SCHEMA}.dim_date")
)


#### *Build fact_sales*

In [0]:
# Staging: one row per sale line, enriched with the order header's date and customer.
# Left join keeps sale lines whose order_id has no matching order (date/customer stay null).
sales_stg = (
    silver_sales.alias("s")
        .join(silver_orders.alias("o"),
              on="order_id",
              how="left"
              )
        .select(
            F.col("s.sale_id").alias("sale_id"),
            F.col("s.order_id").alias("order_id"),
            F.col("o.date").alias("date"),
            F.col("o.customer_id").alias("customer_id"),
            F.col("s.product_id").alias("product_id"),
            F.col("s.quantity")
        )

)

# Resolve each natural key to its surrogate key. All lookups are left joins so
# that unmapped rows survive with a null key and show up in the validation cell.
fact_sales = (
    sales_stg
    # dim_date holds one row per calendar day, so compare on the date part only.
    # NOTE(review): the type of orders.date is not visible here; to_date() leaves
    # DATE values unchanged and strips the time-of-day from TIMESTAMP values,
    # which would otherwise fail to match calendar_date and yield a null date_key.
    .join(dim_date.select("calendar_date", "date_key"),
          F.to_date(sales_stg["date"]) == F.col("calendar_date"), how="left")
    .drop("date")
    .join(dim_customer.select("customer_id", "customer_key"), on="customer_id", how="left")
    .join(dim_product.select("product_id", "product_key"), on="product_id", how="left")
    .select(
        "sale_id",
        "order_id",
        "date_key",
        "customer_key",
        "product_key",
        "quantity",
        # placeholders: typed NULL measures reserved for future enrichment
        F.lit(None).cast("decimal(18,2)").alias("unit_price"),
        F.lit(None).cast("decimal(18,2)").alias("sales_amount"),
        F.lit(None).cast("decimal(18,2)").alias("discount_amount")
    )
)


# Write to Gold (full rebuild: existing data and schema are replaced)
(
    fact_sales.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable(f"{CATALOG}.{GOLD_SCHEMA}.fact_sales")
)

In [0]:
# Row counts for every Gold table, read back from the saved tables.
for gold_table in ("dim_country", "dim_customer", "dim_product", "dim_date", "fact_sales"):
    print(f"{gold_table}:", spark.table(f"{CATALOG}.{GOLD_SCHEMA}.{gold_table}").count())

# Check for unmapped keys (should be 0): count the null surrogate keys in the fact table.
fact_key_columns = ("date_key", "customer_key", "product_key")
spark.table(f"{CATALOG}.{GOLD_SCHEMA}.fact_sales").select(
    *[
        F.sum(F.col(key_column).isNull().cast("int")).alias(f"null_{key_column}")
        for key_column in fact_key_columns
    ]
).show()
