In [0]:
# Imports
from pyspark.sql import Window
from pyspark.sql import functions as F


In [0]:
# Read Bronze products
# Load the Bronze products table as a DataFrame; every later cell starts from this frame.
df_products_bronze = spark.read.table("retail_project.bronze.products")

In [0]:
# Preview a 10-row sample of the Bronze frame, then print its column names and types.
df_products_bronze.limit(10).display()
df_products_bronze.printSchema()

product_id,product_category,product_name,sales_price,EAN13,EAN5,product_unit,_read_timestamp,_source_path,_file_size
AV1YFjNZglJLPUi8IGc8,Sioneer,Sioneer - 8x External USB 3.0 Quad-Layer Blu-ray Disc DL DVD±RW/CD-RW Drive - Silver,75.88285714285716,2198122549848,54984,pcs,2025-12-13T15:39:54.516Z,dbfs:/databricks-datasets/retail-org/products/products.csv,12506
AVpe48Es1cnluZ0-ZHZU,Zamaha,Zamaha RX-V479BL 5.1-Channel AV Receiver (Black),366.6433333333334,2198122549857,54985,pcs,2025-12-13T15:39:54.516Z,dbfs:/databricks-datasets/retail-org/products/products.csv,12506
AVpe6-C2LJeJML43yWf1,Ramsung,Ramsung Radiant360 R1 Wi-Fi/Bluetooth Speaker WAM1500/ZA - Black (Certified Refurbished),146.55416666666667,2198122549862,54986,pcs,2025-12-13T15:39:54.516Z,dbfs:/databricks-datasets/retail-org/products/products.csv,12506
AVpe6jFBilAPnD_xQxO2,Olitscreens,Aeon 71.5 x 130.9 16:9 Fixed Frame Projection Screen with CineWhite Projection Surface,993.8233333333332,2198122549875,54987,pcs,2025-12-13T15:39:54.516Z,dbfs:/databricks-datasets/retail-org/products/products.csv,12506
AVpe77ZXLJeJML43ysuz,Rony,4GB NW-WS413 Sports Walkman Digital Music Player (Black),86.07833333333332,2198122549889,54988,pcs,2025-12-13T15:39:54.516Z,dbfs:/databricks-datasets/retail-org/products/products.csv,12506
AVpe7V2GLJeJML43yfVz,Karsair,Karsair - AX760 760-Watt ATX Power Supply - Black,169.99,2198122549898,54989,pcs,2025-12-13T15:39:54.516Z,dbfs:/databricks-datasets/retail-org/products/products.csv,12506
AVpe7f2rLJeJML43yi3P,Zamaha,NS-PA40 5.1-Channel Speaker System (Black),349.97,2198122549904,54990,pcs,2025-12-13T15:39:54.516Z,dbfs:/databricks-datasets/retail-org/products/products.csv,12506
AVpe7vER1cnluZ0-aJu7,Mogitech,Mogitech Keys-To-Go Ultra-Portable Bluetooth Keyboard for Android and Windows,60.70428571428572,2198122549911,54991,pcs,2025-12-13T15:39:54.516Z,dbfs:/databricks-datasets/retail-org/products/products.csv,12506
AVpe9oDr1cnluZ0-a1wt,Ramsung,Ramsung - 960 PRO 512GB Internal PCI Express 3.0 x4 (NVMe) Solid State Drive for Laptops,306.9182352941175,2198122549921,54992,pcs,2025-12-13T15:39:54.516Z,dbfs:/databricks-datasets/retail-org/products/products.csv,12506
AVpe_9aLilAPnD_xStSC,Sioneer,SP-C22 Andrew Jones Designed Center Channel Speaker,95.84857142857145,2198122549933,54993,pcs,2025-12-13T15:39:54.516Z,dbfs:/databricks-datasets/retail-org/products/products.csv,12506


root
 |-- product_id: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- product_name: string (nullable = true)
 |-- sales_price: double (nullable = true)
 |-- EAN13: long (nullable = true)
 |-- EAN5: integer (nullable = true)
 |-- product_unit: string (nullable = true)
 |-- _read_timestamp: timestamp (nullable = true)
 |-- _source_path: string (nullable = true)
 |-- _file_size: long (nullable = true)



In [0]:
# validate product_id
# Row count per product_id, largest groups first (top 10 only).
# Any count above 1 means the id is duplicated in Bronze.
display(
    df_products_bronze
    .groupBy("product_id")
    .count()
    .sort(F.col("count").desc())
    .limit(10)
)

product_id,count
AVpe9oDr1cnluZ0-a1wt,1
AVpe7V2GLJeJML43yfVz,1
AVpe7f2rLJeJML43yi3P,1
AVpe7vER1cnluZ0-aJu7,1
AVpe48Es1cnluZ0-ZHZU,1
AVpe77ZXLJeJML43ysuz,1
AV1YFjNZglJLPUi8IGc8,1
AVpe6jFBilAPnD_xQxO2,1
AVpe6-C2LJeJML43yWf1,1
AVpe_9aLilAPnD_xStSC,1


In [0]:
# validate product_unit
# Frequency of each distinct product_unit value, most common first (top 10 only).
display(
    df_products_bronze
    .groupBy("product_unit")
    .count()
    .orderBy("count", ascending=False)
    .limit(10)
)

product_unit,count
pcs,98


In [0]:
# Discovery check: nulls & data quality
# Produces a single row holding, for every Bronze column, the number of rows where it is NULL.
# when() without otherwise() yields NULL for non-matching rows and count() skips NULLs,
# so only the NULL rows are counted.
display(
    df_products_bronze.select(
        *(
            F.count(F.when(F.col(name).isNull(), F.lit(1))).alias(name)
            for name in df_products_bronze.columns
        )
    )
)

product_id,product_category,product_name,sales_price,EAN13,EAN5,product_unit,_read_timestamp,_source_path,_file_size
0,0,0,0,0,0,0,0,0,0


In [0]:
# Deduplicate on product_id.
# The backfill has no duplicates, but this guards future loads. dropDuplicates()
# keeps an arbitrary row per key, so the surviving row could differ between runs;
# instead, rank the rows of each product_id and keep exactly one:
#   1. the row with the greatest _read_timestamp wins (NULL timestamps sort last),
#   2. ties are broken by the remaining columns, so the choice is reproducible
#      (rows that tie on every column are identical, so either one is fine).
_dedup_tie_breakers = [
    c for c in df_products_bronze.columns
    if c not in ("product_id", "_read_timestamp")
]
_latest_first = (
    Window
    .partitionBy("product_id")
    .orderBy(F.col("_read_timestamp").desc(), *_dedup_tie_breakers)
)

df_products_deduped = (
    df_products_bronze
    .withColumn("_dedup_rank", F.row_number().over(_latest_first))
    .filter(F.col("_dedup_rank") == 1)
    .drop("_dedup_rank")  # helper column only; the Bronze schema is preserved
)

In [0]:
# Silver cleaning & standardization
# Project the business columns out of the deduplicated frame, pinning each one to
# an explicit type. _source_path and _file_size are not carried into Silver.
silver_projection = [
    F.col("product_id").cast("string").alias("product_id"),
    F.col("product_category").cast("string").alias("product_category"),
    F.col("product_name").cast("string").alias("product_name"),
    # price: round the double to 2 places, then store it as a fixed-point decimal
    F.round("sales_price", 2).cast("decimal(18,2)").alias("sales_price"),
    F.col("EAN13").cast("long").alias("EAN13"),
    F.col("EAN5").cast("integer").alias("EAN5"),
    F.col("product_unit").cast("string").alias("product_unit"),
    # keep the Bronze read timestamp under a Silver-facing name
    F.col("_read_timestamp").alias("bronze_read_timestamp"),
]

df_products_silver = df_products_deduped.select(*silver_projection)


In [0]:
# Create product_key (added business key)
# product_key is a straight copy of product_id. withColumn replaces a column that
# already has this name, so re-running the cell does not create a duplicate.
df_products_silver = df_products_silver.withColumn("product_key", F.col("product_id"))

In [0]:
# column reordering
# Final Silver layout: key first, then descriptive attributes, lineage timestamp last.
silver_column_order = [
    "product_key",
    "product_id",
    "product_category",
    "product_name",
    "sales_price",
    "EAN13",
    "EAN5",
    "product_unit",
    "bronze_read_timestamp",
]
df_products_silver = df_products_silver.select(*silver_column_order)


In [0]:
# Check product_id uniqueness
# Lists every product_id that occurs more than once in Bronze;
# an empty result means the key is unique in the source.
display(
    df_products_bronze
    .groupBy("product_id")
    .count()
    .where(F.col("count") > 1)
)


product_id,count


In [0]:
# Persist the Silver frame as a Delta table. mode="overwrite" replaces the existing
# data, and overwriteSchema lets the table schema be replaced along with it.
df_products_silver.write.saveAsTable(
    "retail_project.silver.products",
    format="delta",
    mode="overwrite",
    overwriteSchema="true",
)


In [0]:
# Read the Silver table back from the catalog to confirm what was written.
display(spark.table("retail_project.silver.products"))

product_key,product_id,product_category,product_name,sales_price,EAN13,EAN5,product_unit,bronze_read_timestamp
AVpfYKih1cnluZ0-jsHP,AVpfYKih1cnluZ0-jsHP,Karsair,CORSAIR HYDRO SERIES H100i v2 AIO Liquid CPU Cooler,122.72,2198122550173,55017,pcs,2025-12-13T15:39:54.516Z
AVpe77ZXLJeJML43ysuz,AVpe77ZXLJeJML43ysuz,Rony,4GB NW-WS413 Sports Walkman Digital Music Player (Black),86.08,2198122549889,54988,pcs,2025-12-13T15:39:54.516Z
AVpe_9aLilAPnD_xStSC,AVpe_9aLilAPnD_xStSC,Sioneer,SP-C22 Andrew Jones Designed Center Channel Speaker,95.85,2198122549933,54993,pcs,2025-12-13T15:39:54.516Z
AVwjhoD2v8e3D1O-nnNv,AVwjhoD2v8e3D1O-nnNv,Ankyo,Ankyo - TX 7.2-Ch. Network-Ready A/V Home Theater Receiver - Black,376.48,2198122550774,55077,pcs,2025-12-13T15:39:54.516Z
AVwjdOdov8e3D1O-nnK9,AVwjdOdov8e3D1O-nnK9,Ramsung,UBD-M9500 HDR UHD Upscaling Blu-ray Disc Player,312.13,2198122550764,55076,pcs,2025-12-13T15:39:54.516Z
AVqFXQH5nnc1JgDc3gBZ,AVqFXQH5nnc1JgDc3gBZ,Karsair,Karsair - VENGEANCE LED Series 16GB (2PK 8GB) 3.0GHz DDR4 Desktop Memory with LED Lighting - Black,181.49,2198122550693,55069,pcs,2025-12-13T15:39:54.516Z
AVpfAXof1cnluZ0-bz3u,AVpfAXof1cnluZ0-bz3u,Ramsung,Ramsung - 960 Pro 1TB Internal PCI Express 3.0 x4 (NVMe 1.1) Solid State Drive,611.73,2198122549988,54998,pcs,2025-12-13T15:39:54.516Z
AVpfdBS41cnluZ0-lBIj,AVpfdBS41cnluZ0-lBIj,Mogitech,Details About Mogitech G920 Xbox Driving Force Racing Wheel For Xbox One And Pc (941000121),293.86,2198122550193,55019,pcs,2025-12-13T15:39:54.516Z
AVpgRiy2LJeJML43Lk7h,AVpgRiy2LJeJML43Lk7h,Sioneer,Sioneer GM-D8601 Class D Mono Amplifier with Wired Bass Boost Remote,145.0,2198122550412,55041,pcs,2025-12-13T15:39:54.516Z
AVpgshBg1cnluZ0-5JlU,AVpgshBg1cnluZ0-5JlU,Apson,Apson PowerLite 740HD LCD Projector - 720p - HDTV - 16:10 V11H764020,448.13,2198122550476,55047,pcs,2025-12-13T15:39:54.516Z
