In [0]:
# Bronze ingestion config
# SOURCE_PATH: directory that is listed to detect the file format and then
#              loaded by the Spark reader further down.
# TARGET_TABLE: table name passed to saveAsTable() when the Bronze data is written
#               (presumably catalog.schema.table -- confirm against the workspace).
SOURCE_PATH = "dbfs:/databricks-datasets/retail-org/purchase_orders/"
TARGET_TABLE = "retail_project.bronze.purchase_orders"

In [0]:
# Imports
from pyspark.sql import functions as F

In [0]:
# Detect file format (standardized, Spark-safe)
#
# Lists SOURCE_PATH, derives the single extension shared by all data files and
# exposes it as FILE_FORMAT. Raises ValueError when the directory contains no
# data files, more than one extension, or an extension outside the allow-list.

files = dbutils.fs.ls(SOURCE_PATH)

# Ignore directories and hidden/metadata entries: names starting with "_"
# (e.g. _SUCCESS) or "." (e.g. .part-0000.crc checksum files). Without the
# directory and dot-prefix checks, such entries would be mistaken for data
# files and could trigger a false "mixed file types" error.
data_files = [
    f.name.lower()
    for f in files
    if not f.isDir()
    and not f.name.startswith(("_", "."))
    and "." in f.name
]

if not data_files:
    raise ValueError(f"No data files found under {SOURCE_PATH}")

# Collect unique file extensions (text after the last dot, already lower-cased)
extensions = {name.rsplit(".", 1)[-1] for name in data_files}

# Enforce single-format sources
if len(extensions) != 1:
    raise ValueError(
        f"Mixed or unsupported file types under {SOURCE_PATH}: {extensions}"
    )

# Exactly one element is left at this point; pop() extracts it
FILE_FORMAT = extensions.pop()

# Allow only known formats
if FILE_FORMAT not in {"parquet", "csv", "json", "xml"}:
    raise ValueError(
        f"Unsupported file format '{FILE_FORMAT}' under {SOURCE_PATH}"
    )

print("Detected format:", FILE_FORMAT)

Detected format: xml


In [0]:
# Peek at the raw file content as plain text; bounded to a few rows so the
# cell output stays small, matching the limit(10) used for the Bronze preview.
display(spark.read.text(SOURCE_PATH).limit(10))

value
Details About Mogitech G920 Xbox Driving Force Racing Wheel For Xbox One And Pc (941000121)
pcs
2198122550193
55019
1564619448
934213542
"JANUS, PAUL C"
RodC-B7u6EU
15
$ 39


In [0]:
# Load the raw XML source: every <purchase_item> element becomes one row
xml_reader = spark.read.format("xml").option("rowTag", "purchase_item")
df_raw = xml_reader.load(SOURCE_PATH)

# Standard Bronze audit columns, appended in this order:
# read time plus the originating file's path and size taken from _metadata
audit_columns = {
    "_read_timestamp": F.current_timestamp(),
    "_source_path": F.col("_metadata.file_path"),
    "_file_size": F.col("_metadata.file_size"),
}

df_bronze = df_raw
for audit_name, audit_expr in audit_columns.items():
    df_bronze = df_bronze.withColumn(audit_name, audit_expr)

# Preview a handful of rows and the resulting schema
preview_rows = df_bronze.limit(10)
display(preview_rows)
df_bronze.printSchema()

EAN13,EAN5,PO,datetime,password,price,product_name,product_unit,purchaser,quantity,supplier,_read_timestamp,_source_path,_file_size
2198122550193,55019,934213542,1564619448,RodC-B7u6EU,$ 39,Details About Mogitech G920 Xbox Driving Force Racing Wheel For Xbox One And Pc (941000121),pcs,"JANUS, PAUL C",15,Mogitech,2025-12-13T15:44:29.968Z,dbfs:/databricks-datasets/retail-org/purchase_orders/purchase_orders.xml,1819235
2198122550745,55074,934214594,1564620839,!mVJZ`$&,$ 17,Mogitech - Harmony 950 Universal Remote - Black,pcs,"CARMEN, ZACHARY C",5,Mogitech,2025-12-13T15:44:29.968Z,dbfs:/databricks-datasets/retail-org/purchase_orders/purchase_orders.xml,1819235
2198122550745,55074,934213362,1564621555,vCa>vq{I,$ 24,Mogitech - Harmony 950 Universal Remote - Black,pcs,"PHILLIPS, DEBORAH A",15,Mogitech,2025-12-13T15:44:29.968Z,dbfs:/databricks-datasets/retail-org/purchase_orders/purchase_orders.xml,1819235
2198122550038,55003,934213097,1564625523,/16kw54A^,$ 28,Mogitech Ultrathin Touch Mouse T630 for Windows 8 Touch Gestures,pcs,"HARRIS, THOMAS H",10,Mogitech,2025-12-13T15:44:29.968Z,dbfs:/databricks-datasets/retail-org/purchase_orders/purchase_orders.xml,1819235
2198122549911,54991,934212803,1564629919,~-,$ 30,Mogitech Keys-To-Go Ultra-Portable Bluetooth Keyboard for Android and Windows,pcs,"BRADLEY, VALENA M",5,Mogitech,2025-12-13T15:44:29.968Z,dbfs:/databricks-datasets/retail-org/purchase_orders/purchase_orders.xml,1819235
2198122550038,55003,934214061,1564634803,g?Khs]=&Q\S,$ 26,Mogitech Ultrathin Touch Mouse T630 for Windows 8 Touch Gestures,pcs,"VERA, JESSICA C",15,Mogitech,2025-12-13T15:44:29.968Z,dbfs:/databricks-datasets/retail-org/purchase_orders/purchase_orders.xml,1819235
2198122550038,55003,934211731,1564635937,oLMzV,YEN 17,Mogitech Ultrathin Touch Mouse T630 for Windows 8 Touch Gestures,pcs,"JACKSON, ANTHONY N",10,Mogitech,2025-12-13T15:44:29.968Z,dbfs:/databricks-datasets/retail-org/purchase_orders/purchase_orders.xml,1819235
2198122550745,55074,934213553,1564635966,%XTI(*(t,$ 10,Mogitech - Harmony 950 Universal Remote - Black,pcs,"STAGOWSKI, JAMES",30,Mogitech,2025-12-13T15:44:29.968Z,dbfs:/databricks-datasets/retail-org/purchase_orders/purchase_orders.xml,1819235
2198122550038,55003,934213308,1564636463,?^]`C;3~@L,YEN 52,Mogitech Ultrathin Touch Mouse T630 for Windows 8 Touch Gestures,pcs,"HOUSE, JACQUELYN B",5,Mogitech,2025-12-13T15:44:29.968Z,dbfs:/databricks-datasets/retail-org/purchase_orders/purchase_orders.xml,1819235
2198122550038,55003,934214388,1564641136,"},OU*^J%G",$ 21,Mogitech Ultrathin Touch Mouse T630 for Windows 8 Touch Gestures,pcs,"RIVERA, JOSE L",50,Mogitech,2025-12-13T15:44:29.968Z,dbfs:/databricks-datasets/retail-org/purchase_orders/purchase_orders.xml,1819235


root
 |-- EAN13: long (nullable = true)
 |-- EAN5: long (nullable = true)
 |-- PO: long (nullable = true)
 |-- datetime: long (nullable = true)
 |-- password: string (nullable = true)
 |-- price: string (nullable = true)
 |-- product_name: string (nullable = true)
 |-- product_unit: string (nullable = true)
 |-- purchaser: string (nullable = true)
 |-- quantity: long (nullable = true)
 |-- supplier: string (nullable = true)
 |-- _read_timestamp: timestamp (nullable = false)
 |-- _source_path: string (nullable = false)
 |-- _file_size: long (nullable = false)



In [0]:
# Write to Delta Bronze table
#
# Full refresh: replaces both the data and the schema of TARGET_TABLE with
# the contents of df_bronze.
(
    df_bronze.write
        .format("delta")
        .option("overwriteSchema", "true")  # Bronze schema is authoritative
        .mode("overwrite")                  # Full refresh
        .saveAsTable(TARGET_TABLE)
)

# Report completion once (the confirmation was previously printed twice)
print(f"Wrote Bronze table: {TARGET_TABLE}")

Wrote Bronze table: retail_project.bronze.purchase_orders


In [0]:
# Quick validation: show how many rows the Bronze table holds after the write
count_query = f"SELECT COUNT(*) AS row_count FROM {TARGET_TABLE}"
row_count_df = spark.sql(count_query)
row_count_df.show()

+---------+
|row_count|
+---------+
|     3973|
+---------+

