# TRANSFORM SILVER LAYER DATA TO GOLD LAYER

In [1]:
# IMPORT RELEVANT PACKAGES

from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import StringType, IntegerType, LongType, BooleanType, DateType, TimestampType, DecimalType

# Schema names used when reading the silver tables and writing the gold tables below.
silver_schema = "silver"
gold_schema = "gold"

# Record the run configuration in the cell output.
print("Starting silver to gold transformation.")  # no placeholders, so a plain string (not an f-string)
print(f"Reading from silver schema: {silver_schema}")
print(f"Writing to Gold Schema: {gold_schema}")
print("-" * 30)

StatementMeta(, 3f6ff14e-60aa-4bff-ba17-6a03df53709b, 3, Finished, Available, Finished)

Starting silver to gold transformation.
Reading from silver schema: silver
Writing to Gold Schema: gold
------------------------------


### Generate dim_date table

In [2]:
# Build the date dimension: one row per calendar day between the earliest and latest
# transaction date found in silver_transaction_details, then persist it as gold.dim_date.

print("Generating Dimension: dim_date")

# Derive the range with F.to_date inside Spark -- the same expression the fact load uses to build
# its TransactionDate join key -- so the boundary days do not depend on how collected timestamps
# are converted to Python datetimes on the driver.
transaction_day = F.to_date(F.col("transaction_ts"))
min_max_dates = (
    spark.read.format("delta").table(f"{silver_schema}.silver_transaction_details")
    .agg(F.min(transaction_day).alias("min_date"), F.max(transaction_day).alias("max_date"))
    .first()
)

start_date = min_max_dates["min_date"]
end_date = min_max_dates["max_date"]

# min/max over zero non-null values yields NULL; fail with an explicit message rather than
# generating a sequence from 'None'.
if start_date is None or end_date is None:
    raise ValueError(
        f"{silver_schema}.silver_transaction_details has no non-null transaction_ts values; "
        "cannot determine the date range for dim_date"
    )

print(f"Date range for dimension: {start_date} to {end_date}")

# Generate one row per day (inclusive of both ends) using Spark SQL.
date_df = spark.sql(f"SELECT explode(sequence(to_date('{start_date}'), to_date('{end_date}'), interval 1 day)) AS calendar_date")

# Add date attributes
calendar_date = F.col("calendar_date")
dim_date = date_df.select(
    calendar_date,
    F.date_format(calendar_date, "yyyyMMdd").cast(IntegerType()).alias("DateKey"),  # e.g., 20230101
    F.year(calendar_date).alias("Year"),
    F.month(calendar_date).alias("Month"),
    F.dayofmonth(calendar_date).alias("Day"),
    F.quarter(calendar_date).alias("Quarter"),
    F.date_format(calendar_date, "E").alias("DayNameShort"),  # e.g., Mon
    F.date_format(calendar_date, "EEEE").alias("DayNameLong"),  # e.g., Monday
    F.dayofweek(calendar_date).alias("DayOfWeek"),  # Sunday=1, Saturday=7
    F.dayofyear(calendar_date).alias("DayOfYear"),
    F.weekofyear(calendar_date).alias("WeekOfYear"),
    F.date_format(calendar_date, "MMMM").alias("MonthNameLong"),  # e.g., January
    F.date_format(calendar_date, "MMM").alias("MonthNameShort"),  # e.g., Jan
    F.dayofweek(calendar_date).isin([1, 7]).alias("IsWeekend"),  # Sunday (1) or Saturday (7)
)

# Save the dimension table (full overwrite on every run)
gold_table_full_name = f"{gold_schema}.dim_date"
dim_date.write.format("delta").mode("overwrite").saveAsTable(gold_table_full_name)
print(f"Successfully generated and saved: {gold_table_full_name}")

StatementMeta(, 3f6ff14e-60aa-4bff-ba17-6a03df53709b, 4, Finished, Available, Finished)

Generating Dimension: dim_date
Date range for dimension: 2023-01-01 to 2023-12-31
Successfully generated and saved: gold.dim_date


### Generate dim_customer table

In [3]:
print("generating Dimension: dim_customer")

# Read the customer records from the silver layer.
customers_table = f"{silver_schema}.silver_customers"
df_silver_customers = spark.read.format("delta").table(customers_table)

# Attach the surrogate key: Spark's generated row id shifted up by 2**33.
customer_key = F.monotonically_increasing_id() + (1 << 33)
dim_customer = df_silver_customers.withColumn("CustomerKey", customer_key)



StatementMeta(, 3f6ff14e-60aa-4bff-ba17-6a03df53709b, 5, Finished, Available, Finished)

generating Dimension: dim_customer


In [4]:
# Shape the customer dimension (surrogate key + renamed business attributes) and persist it.

# Built from df_silver_customers rather than by re-assigning dim_customer from itself: once the
# columns have been renamed the snake_case source columns are gone, so a self-referential select
# fails the second time the cell is run. Deriving from the silver frame keeps the cell re-runnable.
dim_customer = (
    df_silver_customers
    # Same surrogate-key expression as where the key is first added: generated row id + 2**33.
    .withColumn("CustomerKey", F.monotonically_increasing_id() + (1 << 33))
    .select(
        F.col("CustomerKey"),
        F.col("customer_id").alias("CustomerID"),  # Keep business key
        F.col("first_name").alias("FirstName"),
        F.col("last_name").alias("LastName"),
        F.col("email_address").alias("EmailAddress"),
        F.col("signup_ts").alias("SignupTimestamp"),
        F.col("consent_email").alias("HasEmailConsent"),
        F.col("consent_sms").alias("HasSmsConsent"),
        F.col("initial_store").alias("SignupStoreID"),
    )
)

# Save the dimension table (full overwrite on every run)
gold_table_full_name = f"{gold_schema}.dim_customer"
dim_customer.write.format("delta").mode("overwrite").saveAsTable(gold_table_full_name)
print(f"Successfully generated and saved: {gold_table_full_name}")

# NOTE(review): this preview includes name and e-mail columns -- clear the output before sharing.
display(dim_customer.limit(5))



StatementMeta(, 3f6ff14e-60aa-4bff-ba17-6a03df53709b, 6, Finished, Available, Finished)

Successfully generated and saved: gold.dim_customer


SynapseWidget(Synapse.DataFrame, 95ffdc48-5653-4a3a-84ce-bd671bf81703)

### Generate dim_store table


In [5]:
# Start of the dim_store build: load the silver stores and give each row a surrogate key.

print("Generating dimension : dim_store")

# Read the store records from the silver layer.
stores_table = f"{silver_schema}.silver_stores"
df_silver_stores = spark.read.format("delta").table(stores_table)

# Surrogate key: Spark's generated row id shifted up by 2**33.
store_key = F.monotonically_increasing_id() + (1 << 33)
dim_store = df_silver_stores.withColumn("StoreKey", store_key)

StatementMeta(, 3f6ff14e-60aa-4bff-ba17-6a03df53709b, 7, Finished, Available, Finished)

Generating dimension : dim_store


In [6]:
# Shape the store dimension (surrogate key + renamed business attributes) and persist it.

# Built from df_silver_stores rather than by re-assigning dim_store from itself: once the columns
# have been renamed the snake_case source columns are gone, so a self-referential select fails
# the second time the cell is run. Deriving from the silver frame keeps the cell re-runnable.
dim_store = (
    df_silver_stores
    # Same surrogate-key expression as where the key is first added: generated row id + 2**33.
    .withColumn("StoreKey", F.monotonically_increasing_id() + (1 << 33))
    .select(
        F.col("StoreKey"),
        F.col("store_id").alias("StoreID"),  # Keep business key
        F.col("location_name").alias("LocationName"),
        # NOTE(review): "StoreMana" looks like a truncated "StoreManager". Left unchanged because
        # the name is part of the persisted table schema -- confirm with consumers before renaming.
        F.col("store_manager").alias("StoreMana"),
        F.col("size_sqm").alias("SizeSQM"),
        F.col("open_date").alias("OpenDate"),
    )
)

# Save the dimension table (full overwrite on every run)
gold_table_full_name = f"{gold_schema}.dim_store"
dim_store.write.format("delta").mode("overwrite").saveAsTable(gold_table_full_name)
print(f"Successfully generated and saved: {gold_table_full_name}")
display(dim_store.limit(5))

StatementMeta(, 3f6ff14e-60aa-4bff-ba17-6a03df53709b, 8, Finished, Available, Finished)

Successfully generated and saved: gold.dim_store


SynapseWidget(Synapse.DataFrame, 7a9e3629-6d69-454f-b56d-4892ee38fa52)

### Create dim_promotion

In [8]:
print("Generating Dimension: dim_promotion")

# Read the promotions from the silver layer.
df_silver_promotions = spark.read.format("delta").table(f"{silver_schema}.silver_promotions")

# Surrogate key for real promotions: generated row id + 2**33 + 1, so it is always > 0.
# Key 0 is used by the "No Promotion" row added further down.
promotion_key = F.monotonically_increasing_id() + (1 << 33) + 1

# (silver column, dimension column) pairs, listed in output order after the key.
promotion_renames = [
    ("promotion_code", "PromotionCode"),  # business key
    ("promotion_name", "PromotionName"),
    ("promotion_type", "PromotionType"),
    ("discount_value", "DiscountValue"),
    ("start_date", "StartDate"),
    ("end_date", "EndDate"),
]

dim_promotion_base = df_silver_promotions.withColumn("PromotionKey", promotion_key)
dim_promotion_base = dim_promotion_base.select(
    F.col("PromotionKey"),
    *[F.col(source).alias(target) for source, target in promotion_renames],
)

# Single "No Promotion" row, created with the schema of the real promotions so the two
# frames line up column-for-column.
schema = dim_promotion_base.schema
no_promo_row = (0, "NO_PROMO", "No Promotion", "None", 0.0, None, None)
no_promo_df = spark.createDataFrame([no_promo_row], schema)

# "No Promotion" first, followed by the promotions loaded from silver.
dim_promotion = no_promo_df.unionByName(dim_promotion_base)

# Persist the dimension (full overwrite on every run).
gold_table_full_name = f"{gold_schema}.dim_promotion"
dim_promotion.write.format("delta").mode("overwrite").saveAsTable(gold_table_full_name)
print(f"Successfully generated and saved: {gold_table_full_name}")
display(dim_promotion.limit(5))

StatementMeta(, 3f6ff14e-60aa-4bff-ba17-6a03df53709b, 10, Finished, Available, Finished)

Generating Dimension: dim_promotion
Successfully generated and saved: gold.dim_promotion


SynapseWidget(Synapse.DataFrame, c5b184cc-7201-43ad-b30f-866a7cd45f66)

### Generate dim_product table

In [9]:
print("Generating Dimension: dim_product")

# Read the product records from the silver layer.
df_silver_products = spark.read.format("delta").table(f"{silver_schema}.silver_products")

# Surrogate key: Spark's generated row id shifted up by 2**33.
product_key = F.monotonically_increasing_id() + (1 << 33)

# (silver column, dimension column) pairs, listed in output order after the key.
product_renames = [
    ("product_id", "ProductID"),  # business key
    ("product_description", "ProductDescription"),
    ("category", "Category"),
    ("brand", "Brand"),
    ("supplier", "Supplier"),
    ("current_unit_price", "CurrentUnitPrice"),  # the *current* price, not a historical one
    ("current_unit_cost", "CurrentUnitCost"),    # the *current* cost, not a historical one
    ("date_added", "DateAdded"),
    ("is_active", "IsActive"),
]

dim_product = df_silver_products.withColumn("ProductKey", product_key)
dim_product = dim_product.select(
    F.col("ProductKey"),
    *[F.col(source).alias(target) for source, target in product_renames],
)

# Persist the dimension (full overwrite on every run).
gold_table_full_name = f"{gold_schema}.dim_product"
dim_product.write.format("delta").mode("overwrite").saveAsTable(gold_table_full_name)
print(f"Successfully generated and saved: {gold_table_full_name}")
display(dim_product.limit(5))

StatementMeta(, 3f6ff14e-60aa-4bff-ba17-6a03df53709b, 11, Finished, Available, Finished)

Generating Dimension: dim_product
Successfully generated and saved: gold.dim_product


SynapseWidget(Synapse.DataFrame, 4f0b93b9-4212-4bd7-9f9d-fa2fb41f3aca)

### Generate fct_sales table


In [11]:
print("Generating Fact Table: fct_sales")


def _key_or_default(key_name, default_key):
    """Return column `key_name`, replaced by `default_key` where it is NULL, keeping the name."""
    return F.coalesce(F.col(key_name), F.lit(default_key)).alias(key_name)


def _signed(source_name, output_name):
    """Return `source_name` multiplied by the `Multiplier` column, aliased as `output_name`."""
    return (F.col(source_name) * F.col("Multiplier")).alias(output_name)


# --- Sources ---
# Line-level transactions from the silver layer.
df_silver_details = spark.read.format("delta").table(f"{silver_schema}.silver_transaction_details")

# Gold dimensions, trimmed to the join key, the surrogate key and any attribute the measures need.
dim_date_lookup = (
    spark.read.format("delta").table(f"{gold_schema}.dim_date")
    .select(F.col("calendar_date").alias("TransactionDate"), "DateKey")
)

dim_customer_lookup = (
    spark.read.format("delta").table(f"{gold_schema}.dim_customer")
    .select("CustomerID", "CustomerKey")
)

dim_product_lookup = (
    spark.read.format("delta").table(f"{gold_schema}.dim_product")
    .select("ProductID", "ProductKey", F.col("CurrentUnitCost").alias("UnitCost"))  # cost comes from the dimension
)

dim_store_lookup = (
    spark.read.format("delta").table(f"{gold_schema}.dim_store")
    .select("StoreID", "StoreKey")
)

dim_promotion_lookup = (
    spark.read.format("delta").table(f"{gold_schema}.dim_promotion")
    .select("PromotionCode", "PromotionKey", "PromotionType", "DiscountValue")
)

# --- Prepare Transaction Data ---
# Date part of the transaction timestamp; this is the join key against dim_date.
df_trans = df_silver_details.withColumn("TransactionDate", F.to_date(F.col("transaction_ts")))

# --- Surrogate key lookups ---
# Every join is a left join, so a transaction line with no matching dimension row is kept;
# its missing key is replaced with a default in the next step.
customer_match = df_trans["customer_id"] == dim_customer_lookup["CustomerID"]
product_match = df_trans["product_id"] == dim_product_lookup["ProductID"]
store_match = df_trans["store_id"] == dim_store_lookup["StoreID"]
promotion_match = df_trans["promotion_code_applied"] == dim_promotion_lookup["PromotionCode"]

df_joined = (
    df_trans
    .join(dim_date_lookup, on="TransactionDate", how="left")
    .join(dim_customer_lookup, customer_match, how="left")
    .join(dim_product_lookup, product_match, how="left")
    .join(dim_store_lookup, store_match, how="left")
    .join(dim_promotion_lookup, promotion_match, how="left")
)

# --- Default keys for failed lookups ---
# -1 marks an unknown date / customer / product / store; 0 is the "No Promotion" key.
df_keys_handled = df_joined.select(
    _key_or_default("DateKey", -1),
    _key_or_default("CustomerKey", -1),
    _key_or_default("ProductKey", -1),
    _key_or_default("StoreKey", -1),
    _key_or_default("PromotionKey", 0),

    # Degenerate dimensions and the raw values the measures are computed from.
    "transaction_id",
    "transaction_line_id",
    "transaction_ts",
    "transaction_type",  # compared against 'Sale' and 'Return' below
    "quantity",
    "unit_price_recorded",
    "line_item_gross_amount",
    F.coalesce(F.col("UnitCost"), F.lit(0.0)).alias("unit_cost"),  # 0 cost when the product lookup failed
    "PromotionType",
    "DiscountValue",
)

# --- Measures ---
# A discount is only computed for sales carrying a 'Percentage' promotion; everything else gets 0.
# NOTE(review): the product below presumes DiscountValue is stored as a fraction (e.g. 0.10) -- confirm.
is_percentage_sale = (F.col("transaction_type") == 'Sale') & (F.col("PromotionType") == 'Percentage')
unrounded_discount = F.when(
    is_percentage_sale,
    F.col("line_item_gross_amount") * F.col("DiscountValue"),
).otherwise(F.lit(0.0))

df_measures = (
    df_keys_handled
    .withColumn("TotalCostAmount", F.round(F.col("quantity") * F.col("unit_cost"), 2))
    .withColumn("DiscountAmount", F.round(unrounded_discount, 2))
    .withColumn("NetSalesAmount", F.round(F.col("line_item_gross_amount") - F.col("DiscountAmount"), 2))
    # Profit = net sales minus total cost.
    .withColumn("ProfitAmount", F.round(F.col("NetSalesAmount") - F.col("TotalCostAmount"), 2))
)

# --- Returns ---
# Rows typed 'Return' have their quantity and amounts multiplied by -1; all other rows by 1.
return_sign = F.when(F.col("transaction_type") == 'Return', F.lit(-1)).otherwise(F.lit(1))

df_final_fact = df_measures.withColumn("Multiplier", return_sign).select(
    "DateKey",
    "CustomerKey",
    "ProductKey",
    "StoreKey",
    "PromotionKey",
    F.col("transaction_id").alias("TransactionID"),           # degenerate dimension
    F.col("transaction_line_id").alias("TransactionLineID"),  # degenerate dimension
    F.col("transaction_ts").alias("TransactionTimestamp"),
    _signed("quantity", "Quantity"),
    F.col("unit_price_recorded").alias("UnitPrice"),
    F.col("unit_cost").alias("UnitCost"),
    _signed("line_item_gross_amount", "GrossSalesAmount"),
    _signed("DiscountAmount", "DiscountAmount"),
    _signed("NetSalesAmount", "NetSalesAmount"),
    _signed("TotalCostAmount", "TotalCostAmount"),
    _signed("ProfitAmount", "ProfitAmount"),
)

# --- Persist ---
gold_table_full_name = f"{gold_schema}.fct_sales"
# If the table grows large, partitioning (e.g. by DateKey) is an option. Year/Month columns are
# not part of this fact, so they would have to be added before partitioning on them.
df_final_fact.write.format("delta").mode("overwrite").saveAsTable(gold_table_full_name)

print(f"Successfully generated and saved: {gold_table_full_name}")
display(df_final_fact.limit(10))

StatementMeta(, 3f6ff14e-60aa-4bff-ba17-6a03df53709b, 13, Finished, Available, Finished)

Generating Fact Table: fct_sales
Successfully generated and saved: gold.fct_sales


SynapseWidget(Synapse.DataFrame, 5c49df8f-fbc3-46ee-aba2-db40121a8f0d)

In [12]:
# Closing banner for the run log.
banner = "=" * 40
print(banner)
print("Gold Layer Transformation Script Completed.")
print(f"Dimension and Fact tables saved to schema: {gold_schema}")
print(banner)

StatementMeta(, 3f6ff14e-60aa-4bff-ba17-6a03df53709b, 14, Finished, Available, Finished)

Gold Layer Transformation Script Completed.
Dimension and Fact tables saved to schema: gold
