In [0]:
# Bronze ingestion config
# Fully qualified name of the Delta table this notebook writes.
TARGET_TABLE: str = "retail_project.bronze.products"
# Directory that is listed for raw files and then loaded by the reader.
SOURCE_PATH: str = "dbfs:/databricks-datasets/retail-org/products/"

In [0]:
# Imports
from pyspark.sql import functions as F

In [0]:
# Detect file format (standardized, Spark-safe)
#
# Lists SOURCE_PATH (non-recursively), derives the extension of every data
# file, and sets FILE_FORMAT only when all data files share one supported
# extension. Raises ValueError when no data files are found, when extensions
# are mixed, or when the single extension is not a supported format.

SUPPORTED_FORMATS = {"parquet", "csv", "json", "xml"}

files = dbutils.fs.ls(SOURCE_PATH)

# Keep only real data files:
#   - names ending in "/" are directories in a dbutils.fs.ls listing; without
#     this check a directory such as "v1.0/" would be treated as a file with
#     the bogus extension "0/" and trip the mixed-format error
#   - "_"-prefixed names are Spark metadata/marker files
#   - "."-prefixed names are hidden files (e.g. ".crc" checksum side-cars)
#   - names without a "." carry no extension to inspect
data_files = [
    f.name.lower()
    for f in files
    if not f.name.endswith("/")
    and not f.name.startswith(("_", "."))
    and "." in f.name
]

if not data_files:
    raise ValueError(f"No data files found under {SOURCE_PATH}")

# Collect unique file extensions (text after the last dot, already lowercased)
extensions = {name.rsplit(".", 1)[-1] for name in data_files}

# Enforce single-format sources
if len(extensions) != 1:
    raise ValueError(
        f"Mixed or unsupported file types under {SOURCE_PATH}: {extensions}"
    )

FILE_FORMAT = extensions.pop()

# Allow only known formats
if FILE_FORMAT not in SUPPORTED_FORMATS:
    raise ValueError(
        f"Unsupported file format '{FILE_FORMAT}' under {SOURCE_PATH}"
    )

print("Detected format:", FILE_FORMAT)

Detected format: csv


In [0]:
# Read raw CSV data (semicolon-delimited)

# The reader below is CSV-specific. Refuse to continue if the format detected
# earlier is anything else: otherwise Spark would parse e.g. Parquet bytes as
# CSV text in PERMISSIVE mode and the full-refresh overwrite that follows
# would replace the Bronze table with garbage.
if FILE_FORMAT != "csv":
    raise ValueError(
        f"Expected csv files under {SOURCE_PATH}, but detected '{FILE_FORMAT}'"
    )

# NOTE(review): inferSchema types EAN13/EAN5 as long/integer (see the printed
# schema), which would drop any leading zeros in those codes — confirm whether
# Bronze should keep them as strings via an explicit schema.
reader = (
    spark.read
         .format("csv")
         .option("header", "true")       # first line holds the column names
         .option("delimiter", ";")       # dataset-specific quirk
         .option("inferSchema", "true")  # extra pass over the data to type columns
         .option("mode", "PERMISSIVE")   # keep malformed rows instead of failing
)

df_raw = reader.load(SOURCE_PATH)

# Bronze enrichment (standard): load time plus per-row file lineage taken from
# the reader's hidden _metadata column.
df_bronze = (
    df_raw
    .withColumn("_read_timestamp", F.current_timestamp())
    .withColumn("_source_path", F.col("_metadata.file_path"))
    .withColumn("_file_size", F.col("_metadata.file_size"))
)

# Preview a small sample and the resulting schema.
display(df_bronze.limit(10))
df_bronze.printSchema()

product_id,product_category,product_name,sales_price,EAN13,EAN5,product_unit,_read_timestamp,_source_path,_file_size
AV1YFjNZglJLPUi8IGc8,Sioneer,Sioneer - 8x External USB 3.0 Quad-Layer Blu-ray Disc DL DVD±RW/CD-RW Drive - Silver,75.88285714285716,2198122549848,54984,pcs,2025-12-13T15:39:40.661Z,dbfs:/databricks-datasets/retail-org/products/products.csv,12506
AVpe48Es1cnluZ0-ZHZU,Zamaha,Zamaha RX-V479BL 5.1-Channel AV Receiver (Black),366.6433333333334,2198122549857,54985,pcs,2025-12-13T15:39:40.661Z,dbfs:/databricks-datasets/retail-org/products/products.csv,12506
AVpe6-C2LJeJML43yWf1,Ramsung,Ramsung Radiant360 R1 Wi-Fi/Bluetooth Speaker WAM1500/ZA - Black (Certified Refurbished),146.55416666666667,2198122549862,54986,pcs,2025-12-13T15:39:40.661Z,dbfs:/databricks-datasets/retail-org/products/products.csv,12506
AVpe6jFBilAPnD_xQxO2,Olitscreens,Aeon 71.5 x 130.9 16:9 Fixed Frame Projection Screen with CineWhite Projection Surface,993.8233333333332,2198122549875,54987,pcs,2025-12-13T15:39:40.661Z,dbfs:/databricks-datasets/retail-org/products/products.csv,12506
AVpe77ZXLJeJML43ysuz,Rony,4GB NW-WS413 Sports Walkman Digital Music Player (Black),86.07833333333332,2198122549889,54988,pcs,2025-12-13T15:39:40.661Z,dbfs:/databricks-datasets/retail-org/products/products.csv,12506
AVpe7V2GLJeJML43yfVz,Karsair,Karsair - AX760 760-Watt ATX Power Supply - Black,169.99,2198122549898,54989,pcs,2025-12-13T15:39:40.661Z,dbfs:/databricks-datasets/retail-org/products/products.csv,12506
AVpe7f2rLJeJML43yi3P,Zamaha,NS-PA40 5.1-Channel Speaker System (Black),349.97,2198122549904,54990,pcs,2025-12-13T15:39:40.661Z,dbfs:/databricks-datasets/retail-org/products/products.csv,12506
AVpe7vER1cnluZ0-aJu7,Mogitech,Mogitech Keys-To-Go Ultra-Portable Bluetooth Keyboard for Android and Windows,60.70428571428572,2198122549911,54991,pcs,2025-12-13T15:39:40.661Z,dbfs:/databricks-datasets/retail-org/products/products.csv,12506
AVpe9oDr1cnluZ0-a1wt,Ramsung,Ramsung - 960 PRO 512GB Internal PCI Express 3.0 x4 (NVMe) Solid State Drive for Laptops,306.9182352941175,2198122549921,54992,pcs,2025-12-13T15:39:40.661Z,dbfs:/databricks-datasets/retail-org/products/products.csv,12506
AVpe_9aLilAPnD_xStSC,Sioneer,SP-C22 Andrew Jones Designed Center Channel Speaker,95.84857142857145,2198122549933,54993,pcs,2025-12-13T15:39:40.661Z,dbfs:/databricks-datasets/retail-org/products/products.csv,12506


root
 |-- product_id: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- product_name: string (nullable = true)
 |-- sales_price: double (nullable = true)
 |-- EAN13: long (nullable = true)
 |-- EAN5: integer (nullable = true)
 |-- product_unit: string (nullable = true)
 |-- _read_timestamp: timestamp (nullable = false)
 |-- _source_path: string (nullable = false)
 |-- _file_size: long (nullable = false)



In [0]:
# Write to Delta Bronze table
# Configure the writer first, then materialise it as a managed table.
bronze_writer = (
    df_bronze.write
    .format("delta")
    .mode("overwrite")                  # every run fully replaces the table
    .option("overwriteSchema", "true")  # the incoming schema replaces the stored one
)
bronze_writer.saveAsTable(TARGET_TABLE)

print(f"Wrote Bronze table: {TARGET_TABLE}")

Wrote Bronze table: retail_project.bronze.products


In [0]:
# Quick validation
# Print the row count of the freshly written table, then fail loudly if the
# full-refresh overwrite left it empty instead of only displaying the number.
row_count_df = spark.sql(f"SELECT COUNT(*) AS row_count FROM {TARGET_TABLE}")
row_count_df.show()

row_count = row_count_df.first()["row_count"]
if row_count == 0:
    raise ValueError(f"Bronze table {TARGET_TABLE} is empty after write")

+---------+
|row_count|
+---------+
|       98|
+---------+

