In [0]:
# Imports
from pyspark.sql import functions as F
from pyspark.sql import DataFrame

# Table identifiers used by this notebook (three-part dotted names).

# Inputs read from the silver schema
CUSTOMERS_SILVER = "retail_project.silver.customers"
LOYALTY_SILVER = "retail_project.silver.loyalty_segments"

# Output written to the gold schema
DIM_CUSTOMER_GOLD = "retail_project.gold.dim_customer"


In [0]:
# Load both silver inputs, then preview the first five rows of each
df_customers = spark.table(CUSTOMERS_SILVER)
df_loyalty = spark.table(LOYALTY_SILVER)

for silver_preview in (df_customers, df_loyalty):
    display(silver_preview.limit(5))



customer_key,valid_to,tax_id,tax_code,customer_name,ship_to_address,units_purchased,loyalty_segment,customer_id,bronze_read_timestamp
11123757#1532824233,1548137353.0,,,"SMITH, SHIRLEY","IN, 46506.0, N CENTER ST, 521.0",34,3,11123757,2025-12-13T15:38:02.230Z
30585978#1523100473,,,,"STEPHENS, GERALDINE M","OR, 0, NO SITUS, nan",18,3,30585978,2025-12-13T15:38:02.230Z
349822#1522922493,,,,"GUZMAN, CARMEN","VA, 22181, HILL RD, 2860",5,0,349822,2025-12-13T15:38:02.230Z
27652636#1531834357,1558052195.0,,,"HASSETT, PATRICK J","WI, 53058.0, IVY LANE, W333N 5591",7,1,27652636,2025-12-13T15:38:02.230Z
14437343#1517227530,,,,"HENTZ, DIANA L","OH, 43228.0, ALLIANCE WAY, 5706",0,0,14437343,2025-12-13T15:38:02.230Z


loyalty_segment_id,loyalty_segment_description,unit_threshold,valid_from,valid_to,bronze_read_timestamp
1,level_1,10,2017-01-01,,2025-12-13T16:15:02.069Z
3,level_3,70,2017-01-01,,2025-12-13T16:15:02.069Z
0,level_0,0,2017-01-01,,2025-12-13T16:15:02.069Z
2,level_2,30,2017-01-01,,2025-12-13T16:15:02.069Z


In [0]:
# Select & Standardize Customer Columns
# Keeps only the customer attributes needed downstream and casts each of
# them to string. `loyalty_segment` is the only column that is renamed.
df_customers_sel = df_customers.select(
    F.col("customer_key").cast("string"),
    F.col("customer_id").cast("string"),
    F.col("customer_name").cast("string"),
    F.col("tax_id").cast("string"),
    F.col("tax_code").cast("string"),
    # Cast first, alias last: the alias is then the outermost expression, so
    # the output column name does not depend on Spark carrying an inner
    # alias through the cast.
    F.col("loyalty_segment").cast("string").alias("loyalty_segment_id"),
    F.col("ship_to_address").cast("string"),
)

In [0]:
# Prepare Loyalty Segments Lookup
# Reduce the loyalty table to the segment id and its description, both as strings.
# NOTE(review): the source table also carries valid_from / valid_to columns
# (see the preview output above). If it can ever hold more than one row per
# loyalty_segment_id, joining on the id alone will multiply customer rows —
# confirm against the silver table's contract.
df_loyalty_sel = df_loyalty.select(
    *[
        F.col(lookup_col).cast("string")
        for lookup_col in ("loyalty_segment_id", "loyalty_segment_description")
    ]
)

In [0]:
# Join Customers -> Loyalty Segments
# Left join on the shared key: every customer row is kept, and a customer
# whose segment id has no match gets a null loyalty_segment_description.
df_dim_customer = df_customers_sel.join(
    df_loyalty_sel, on=["loyalty_segment_id"], how="left"
)


In [0]:
# Final Column Order
# Column layout of the dimension, in the order it is written out.
dim_customer_columns = [
    "customer_key",  # PK
    "customer_id",
    "customer_name",
    "tax_id",
    "tax_code",
    "loyalty_segment_id",
    "loyalty_segment_description",
    "ship_to_address",
]
df_dim_customer_final = df_dim_customer.select(*dim_customer_columns)


In [0]:
# Duplicate check
# customer_key is the primary key of the dimension, so no value may occur
# more than once. Collect the offending keys (if any) with their counts.
df_duplicate_keys = (
    df_dim_customer_final
    .groupBy("customer_key")
    .count()
    .filter("count > 1")
)
df_duplicate_keys.display()

# Displaying the result alone does not stop a scheduled run: fail here so
# later cells never execute with a violated primary key.
# take(1) fetches at most one row, enough to know whether any duplicate exists.
if df_duplicate_keys.take(1):
    raise ValueError(
        "Duplicate customer_key values found in df_dim_customer_final; "
        "see the displayed keys above."
    )


customer_key,count


In [0]:
# Overwrite gold table
# Full refresh of the Delta table: mode "overwrite" replaces the existing
# contents, and overwriteSchema lets the table schema be replaced as well.
gold_writer = (
    df_dim_customer_final.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
gold_writer.saveAsTable(DIM_CUSTOMER_GOLD)


In [0]:
# Validation
# Read the gold table back from the catalog and show what was written.
df_gold_dim_customer = spark.table(DIM_CUSTOMER_GOLD)
df_gold_dim_customer.display()


customer_key,customer_id,customer_name,tax_id,tax_code,loyalty_segment_id,loyalty_segment_description,ship_to_address
11123757#1532824233,11123757,"SMITH, SHIRLEY",,,3,level_3,"IN, 46506.0, N CENTER ST, 521.0"
30585978#1523100473,30585978,"STEPHENS, GERALDINE M",,,3,level_3,"OR, 0, NO SITUS, nan"
349822#1522922493,349822,"GUZMAN, CARMEN",,,0,level_0,"VA, 22181, HILL RD, 2860"
27652636#1531834357,27652636,"HASSETT, PATRICK J",,,1,level_1,"WI, 53058.0, IVY LANE, W333N 5591"
14437343#1517227530,14437343,"HENTZ, DIANA L",,,0,level_0,"OH, 43228.0, ALLIANCE WAY, 5706"
20441596#1519335250,20441596,"TIRADO, MARCO A",,,3,level_3,"NY, 13072, County Road 16, 2792"
5945686#1518988242,5945686,"SKORA, BRIAN S",,,1,level_1,"MI, 48205.0, E 8 MILE RD, 16414.0"
5385771#1518239268,5385771,"SLAWEK, DEAN J",,,3,level_3,"PA, 19147-3204, FITZWATER ST, 328"
1427940#1529087690,1427940,"REAVES, LIONEL C",,,2,level_2,"VA, 24445.0, HOT SPRINGS RD, 6419.0"
10457387#1535887733,10457387,"BONGIOVANNI, KELLY M",,,2,level_2,"IN, 47591, JERRY ST, 2006.0"
