In [0]:
# Imports
from pyspark.sql import functions as F

In [0]:
# Load the Bronze-layer customers table into a DataFrame.
BRONZE_CUSTOMERS_TABLE = "retail_project.bronze.customers"
df_customers_bronze = spark.table(BRONZE_CUSTOMERS_TABLE)

In [0]:
# Preview the first 10 Bronze rows, then print the column names and types.
df_customers_bronze.limit(10).display()
df_customers_bronze.printSchema()

customer_id,tax_id,tax_code,customer_name,state,city,postcode,street,number,unit,region,district,lon,lat,ship_to_address,valid_from,valid_to,units_purchased,loyalty_segment,_bronze_load_ts,_bronze_source_path,_bronze_file_size,_read_timestamp,_source_path,_file_size
11123757,,,"SMITH, SHIRLEY",IN,BREMEN,46506.0,N CENTER ST,521.0,,Indiana,50.0,-86.1465825,41.4507625,"IN, 46506.0, N CENTER ST, 521.0",1532824233,1548137353.0,34.0,3,,,,2025-12-13T15:38:02.230Z,dbfs:/databricks-datasets/retail-org/customers/customers.csv,4550361
30585978,,,"STEPHENS, GERALDINE M",OR,ADDRESS,0,NO SITUS,,,,,-122.1055158,45.374317,"OR, 0, NO SITUS, nan",1523100473,,18.0,3,,,,2025-12-13T15:38:02.230Z,dbfs:/databricks-datasets/retail-org/customers/customers.csv,4550361
349822,,,"GUZMAN, CARMEN",VA,VIENNA,22181,HILL RD,2860,,VA,,-77.2941261,38.88303270000001,"VA, 22181, HILL RD, 2860",1522922493,,5.0,0,,,,2025-12-13T15:38:02.230Z,dbfs:/databricks-datasets/retail-org/customers/customers.csv,4550361
27652636,,,"HASSETT, PATRICK J",WI,VILLAGE OF NASHOTAH,53058.0,IVY LANE,W333N 5591,,,,-88.40951700000002,43.1213789,"WI, 53058.0, IVY LANE, W333N 5591",1531834357,1558052195.0,7.0,1,,,,2025-12-13T15:38:02.230Z,dbfs:/databricks-datasets/retail-org/customers/customers.csv,4550361
14437343,,,"HENTZ, DIANA L",OH,COLUMBUS,43228.0,ALLIANCE WAY,5706,,OH,FRA,-83.158438,39.97821810000001,"OH, 43228.0, ALLIANCE WAY, 5706",1517227530,,0.0,0,,,,2025-12-13T15:38:02.230Z,dbfs:/databricks-datasets/retail-org/customers/customers.csv,4550361
20441596,,,"TIRADO, MARCO A",NY,Otselic,13072,County Road 16,2792,,NY,Chenango,-75.7505808,42.7172722,"NY, 13072, County Road 16, 2792",1519335250,,24.0,3,,,,2025-12-13T15:38:02.230Z,dbfs:/databricks-datasets/retail-org/customers/customers.csv,4550361
5945686,,,"SKORA, BRIAN S",MI,,48205.0,E 8 MILE RD,16414.0,,,,-82.950874,42.4499233,"MI, 48205.0, E 8 MILE RD, 16414.0",1518988242,,7.0,1,,,,2025-12-13T15:38:02.230Z,dbfs:/databricks-datasets/retail-org/customers/customers.csv,4550361
5385771,,,"SLAWEK, DEAN J",PA,,19147-3204,FITZWATER ST,328,,,,-75.14920550000002,39.9389473,"PA, 19147-3204, FITZWATER ST, 328",1518239268,,18.0,3,,,,2025-12-13T15:38:02.230Z,dbfs:/databricks-datasets/retail-org/customers/customers.csv,4550361
1427940,,,"REAVES, LIONEL C",VA,HOT SPRINGS,24445.0,HOT SPRINGS RD,6419.0,,,,-79.90497859999998,37.8949737,"VA, 24445.0, HOT SPRINGS RD, 6419.0",1529087690,,10.0,2,,,,2025-12-13T15:38:02.230Z,dbfs:/databricks-datasets/retail-org/customers/customers.csv,4550361
10457387,,,"BONGIOVANNI, KELLY M",IN,VINCENNES,47591,JERRY ST,2006.0,,Indiana,42.0,-87.519002,38.662178,"IN, 47591, JERRY ST, 2006.0",1535887733,,9.0,2,,,,2025-12-13T15:38:02.230Z,dbfs:/databricks-datasets/retail-org/customers/customers.csv,4550361


root
 |-- customer_id: integer (nullable = true)
 |-- tax_id: double (nullable = true)
 |-- tax_code: string (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- state: string (nullable = true)
 |-- city: string (nullable = true)
 |-- postcode: string (nullable = true)
 |-- street: string (nullable = true)
 |-- number: string (nullable = true)
 |-- unit: string (nullable = true)
 |-- region: string (nullable = true)
 |-- district: string (nullable = true)
 |-- lon: double (nullable = true)
 |-- lat: double (nullable = true)
 |-- ship_to_address: string (nullable = true)
 |-- valid_from: integer (nullable = true)
 |-- valid_to: double (nullable = true)
 |-- units_purchased: double (nullable = true)
 |-- loyalty_segment: integer (nullable = true)
 |-- _bronze_load_ts: timestamp (nullable = true)
 |-- _bronze_source_path: string (nullable = true)
 |-- _bronze_file_size: long (nullable = true)
 |-- _read_timestamp: timestamp (nullable = true)
 |-- _source_path: string (nulla

In [0]:
# Validate customer_id: show the 10 ids with the highest row counts.
# Any count above 1 means customer_id alone does not identify a single row.
customer_id_counts = df_customers_bronze.groupBy("customer_id").count()
top_customer_id_counts = customer_id_counts.sort(F.col("count").desc()).limit(10)
top_customer_id_counts.display()

customer_id,count
7237214,2
2388421,2
17286556,2
915104,2
12703248,2
5260201,2
18654833,2
9440771,2
2826093,2
12655486,2


In [0]:
# Discovery check: count NULL values in every Bronze column.
# F.when(...) without an otherwise() yields NULL for rows that do not match,
# and F.count() skips NULLs, so each aggregate counts only the rows where the
# column itself is NULL.
null_count_exprs = []
for column_name in df_customers_bronze.columns:
    is_missing = F.col(column_name).isNull()
    null_count_exprs.append(
        F.count(F.when(is_missing, column_name)).alias(column_name)
    )

df_customers_bronze.select(null_count_exprs).display()

customer_id,tax_id,tax_code,customer_name,state,city,postcode,street,number,unit,region,district,lon,lat,ship_to_address,valid_from,valid_to,units_purchased,loyalty_segment,_bronze_load_ts,_bronze_source_path,_bronze_file_size,_read_timestamp,_source_path,_file_size
0,19389,19389,0,0,4765,0,0,123,25613,15152,16732,0,0,0,0,27380,0,0,28813,28813,28813,0,0,0


In [0]:
# Build customer_key, the new primary key for customer rows.
# Format: "<customer_id>#<valid_from>"
customer_key_parts = [
    F.col("customer_id"),
    F.col("valid_from").cast("string"),
]
customer_key_expr = F.concat_ws("#", *customer_key_parts)

df_customers_with_key = df_customers_bronze.withColumn("customer_key", customer_key_expr)

In [0]:
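# Not used: de-duplicating on customer_id alone would drop rows for ids that appear more than once.
# df_customers_deduped = df_customers_bronze.dropDuplicates(["customer_id"])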


In [0]:
# Silver cleaning & standardization.
# Projects the keyed Bronze rows down to the Silver column set and normalizes types.
df_customers_silver = (
    df_customers_with_key
    .select(
        # Added PK: customer_key ("<customer_id>#<valid_from>")
        F.col("customer_key").cast("string"),

        # NOTE(review): valid_to is passed through without a cast (double in Bronze);
        # confirm whether it should be typed like the other columns.
        F.col("valid_to"),

        # tax_id is a double in Bronze. Casting a double straight to string keeps
        # float formatting (a trailing ".0", or scientific notation for large values),
        # which is wrong for an identifier. Going through a zero-scale decimal first
        # yields plain digits; NULLs stay NULL.
        # Assumes tax_id values are whole numbers — TODO confirm against the source data.
        # The explicit alias keeps the output column named "tax_id".
        F.col("tax_id").cast("decimal(38,0)").cast("string").alias("tax_id"),
        F.col("tax_code").cast("string"),
        F.col("customer_name").cast("string"),
        F.col("ship_to_address").cast("string"),

        # changed type from double -> int
        F.col("units_purchased").cast("int"),

        # stored as string in Silver (integer in Bronze)
        F.col("loyalty_segment").cast("string"),

        F.col("customer_id").cast("string"),

        # Bronze read timestamp, renamed for the Silver table
        F.col("_read_timestamp").alias("bronze_read_timestamp")
    )
)

In [0]:
# Check customer_key uniqueness: list every key that appears on more than one row.
# An empty result means customer_key is unique in the Silver DataFrame.
customer_key_counts = df_customers_silver.groupBy("customer_key").count()
customer_key_counts.where(F.col("count") > 1).display()


customer_key,count


In [0]:
# Persist the Silver customers DataFrame as a Delta table, overwriting any
# existing data and allowing the table schema to be replaced.
silver_writer = df_customers_silver.write.format("delta").mode("overwrite")
silver_writer = silver_writer.option("overwriteSchema", "true")
silver_writer.saveAsTable("retail_project.silver.customers")


In [0]:
# Read the saved Silver table back and display it.
display(spark.table("retail_project.silver.customers"))


customer_key,valid_to,tax_id,tax_code,customer_name,ship_to_address,units_purchased,loyalty_segment,customer_id,bronze_read_timestamp
11123757#1532824233,1548137353.0,,,"SMITH, SHIRLEY","IN, 46506.0, N CENTER ST, 521.0",34,3,11123757,2025-12-13T15:38:02.230Z
30585978#1523100473,,,,"STEPHENS, GERALDINE M","OR, 0, NO SITUS, nan",18,3,30585978,2025-12-13T15:38:02.230Z
349822#1522922493,,,,"GUZMAN, CARMEN","VA, 22181, HILL RD, 2860",5,0,349822,2025-12-13T15:38:02.230Z
27652636#1531834357,1558052195.0,,,"HASSETT, PATRICK J","WI, 53058.0, IVY LANE, W333N 5591",7,1,27652636,2025-12-13T15:38:02.230Z
14437343#1517227530,,,,"HENTZ, DIANA L","OH, 43228.0, ALLIANCE WAY, 5706",0,0,14437343,2025-12-13T15:38:02.230Z
20441596#1519335250,,,,"TIRADO, MARCO A","NY, 13072, County Road 16, 2792",24,3,20441596,2025-12-13T15:38:02.230Z
5945686#1518988242,,,,"SKORA, BRIAN S","MI, 48205.0, E 8 MILE RD, 16414.0",7,1,5945686,2025-12-13T15:38:02.230Z
5385771#1518239268,,,,"SLAWEK, DEAN J","PA, 19147-3204, FITZWATER ST, 328",18,3,5385771,2025-12-13T15:38:02.230Z
1427940#1529087690,,,,"REAVES, LIONEL C","VA, 24445.0, HOT SPRINGS RD, 6419.0",10,2,1427940,2025-12-13T15:38:02.230Z
10457387#1535887733,,,,"BONGIOVANNI, KELLY M","IN, 47591, JERRY ST, 2006.0",9,2,10457387,2025-12-13T15:38:02.230Z
