In [0]:
# Bronze ingestion config
# SOURCE_PATH  - folder that is listed for format detection and then read as the raw input.
# TARGET_TABLE - name of the Delta table the Bronze DataFrame is saved to
#                (three-part name; presumably catalog.schema.table - confirm against the workspace).
SOURCE_PATH = "dbfs:/databricks-datasets/retail-org/active_promotions/"
TARGET_TABLE = "retail_project.bronze.active_promotions"

In [0]:
# Imports
from pyspark.sql import functions as F

In [0]:
# Detect file format (standardized, Spark-safe)
#
# Lists SOURCE_PATH, derives the single file extension used by the data files
# and stores it in FILE_FORMAT for later cells.
# Raises ValueError when the folder holds no data files, mixes several
# extensions, or uses a format outside SUPPORTED_FORMATS.

SUPPORTED_FORMATS = {"parquet", "csv", "json", "xml"}

files = dbutils.fs.ls(SOURCE_PATH)

# Keep real data files only:
#   - directories are skipped explicitly with isDir(); relying on the name not
#     containing a dot let a folder such as "v1.0/" through as extension "0/"
#   - names starting with "_" or "." are skipped (commit markers such as
#     _SUCCESS, checksum side-files such as .part-0000.parquet.crc); Spark's
#     file-based readers treat both prefixes as hidden files
#   - names without any dot have no extension to inspect
data_files = [
    f.name.lower()
    for f in files
    if not f.isDir()
    and not f.name.startswith(("_", "."))
    and "." in f.name
]

if not data_files:
    raise ValueError(f"No data files found under {SOURCE_PATH}")

# Collect unique file extensions (text after the last dot; names are already lower-cased)
extensions = {name.rsplit(".", 1)[-1] for name in data_files}

# Enforce single-format sources
if len(extensions) != 1:
    raise ValueError(
        f"Mixed or unsupported file types under {SOURCE_PATH}: {extensions}"
    )

# Exactly one element is left at this point, so pop() is deterministic
FILE_FORMAT = extensions.pop()

# Allow only known formats
if FILE_FORMAT not in SUPPORTED_FORMATS:
    raise ValueError(
        f"Unsupported file format '{FILE_FORMAT}' under {SOURCE_PATH}"
    )

print("Detected format:", FILE_FORMAT)

Detected format: parquet


In [0]:
# Read raw Parquet data and add the standard Bronze audit columns.
#
# The reader below is hard-wired to Parquet, while the detection cell accepts
# several formats. Fail fast with a clear message if the detected format is
# anything else, instead of letting the Parquet reader fail on foreign files.
if FILE_FORMAT != "parquet":
    raise ValueError(
        f"This cell reads Parquet, but the detected format under {SOURCE_PATH} "
        f"is '{FILE_FORMAT}'"
    )

df_raw = spark.read.parquet(SOURCE_PATH)

# Bronze enrichment (standard)
df_bronze = (
    df_raw
    # Timestamp of this ingestion run
    .withColumn("_read_timestamp", F.current_timestamp())
    # Per-row lineage taken from the hidden _metadata column of file-based sources
    .withColumn("_source_path", F.col("_metadata.file_path"))
    .withColumn("_file_size", F.col("_metadata.file_size"))
)

# Preview a small sample and the resulting schema (including the added columns)
display(df_bronze.limit(10))
df_bronze.printSchema()

promo_customer,promo_item,promo_disc,promo_id,promo_datetime,promo_qty,cumsum,promo_began,units_required,eligible,deadline,_read_timestamp,_source_path,_file_size
10294348,AVqVGUFCv8e3D1O-ldFF,0.05,1,1573507549,2,2,1573507549,5,0,,2025-12-13T16:14:22.023Z,dbfs:/databricks-datasets/retail-org/active_promotions/active_promotions.parquet,15390
10393805,AVpfMVD-ilAPnD_xW6bu,0.03,0,1564694959,1,1,1564694959,3,0,,2025-12-13T16:14:22.023Z,dbfs:/databricks-datasets/retail-org/active_promotions/active_promotions.parquet,15390
10393805,AVpfMVD-ilAPnD_xW6bu,0.03,0,1565691984,1,2,1564694959,3,0,,2025-12-13T16:14:22.023Z,dbfs:/databricks-datasets/retail-org/active_promotions/active_promotions.parquet,15390
10393805,AVpfMVD-ilAPnD_xW6bu,0.03,0,1566880480,2,4,1564694959,3,1,,2025-12-13T16:14:22.023Z,dbfs:/databricks-datasets/retail-org/active_promotions/active_promotions.parquet,15390
10393805,AVpfMVD-ilAPnD_xW6bu,0.03,0,1569446371,3,7,1564694959,3,1,,2025-12-13T16:14:22.023Z,dbfs:/databricks-datasets/retail-org/active_promotions/active_promotions.parquet,15390
10393805,AVpfMVD-ilAPnD_xW6bu,0.03,0,1573035682,1,8,1564694959,3,1,,2025-12-13T16:14:22.023Z,dbfs:/databricks-datasets/retail-org/active_promotions/active_promotions.parquet,15390
10393805,AVpfPEx61cnluZ0-gyT9,0.03,0,1571641098,2,2,1571641098,3,0,,2025-12-13T16:14:22.023Z,dbfs:/databricks-datasets/retail-org/active_promotions/active_promotions.parquet,15390
10393805,AVpfPEx61cnluZ0-gyT9,0.03,0,1571824270,2,4,1571641098,3,1,,2025-12-13T16:14:22.023Z,dbfs:/databricks-datasets/retail-org/active_promotions/active_promotions.parquet,15390
10393805,AVpfYKih1cnluZ0-jsHP,0.07,2,1568287781,2,2,1568287781,4,0,2019-10-01 01:00:00,2025-12-13T16:14:22.023Z,dbfs:/databricks-datasets/retail-org/active_promotions/active_promotions.parquet,15390
10393805,AVpge6k2LJeJML43OhAl,0.07,2,1567989318,2,2,1567989318,4,0,2019-10-01 01:00:00,2025-12-13T16:14:22.023Z,dbfs:/databricks-datasets/retail-org/active_promotions/active_promotions.parquet,15390


root
 |-- promo_customer: string (nullable = true)
 |-- promo_item: string (nullable = true)
 |-- promo_disc: double (nullable = true)
 |-- promo_id: long (nullable = true)
 |-- promo_datetime: string (nullable = true)
 |-- promo_qty: long (nullable = true)
 |-- cumsum: long (nullable = true)
 |-- promo_began: string (nullable = true)
 |-- units_required: long (nullable = true)
 |-- eligible: long (nullable = true)
 |-- deadline: string (nullable = true)
 |-- _read_timestamp: timestamp (nullable = false)
 |-- _source_path: string (nullable = false)
 |-- _file_size: long (nullable = false)



In [0]:
# Persist the enriched DataFrame as the Bronze Delta table.
# - mode="overwrite": every run is a full refresh of the table
# - overwriteSchema="true": the incoming DataFrame schema replaces the stored one
df_bronze.write.saveAsTable(
    TARGET_TABLE,
    format="delta",
    mode="overwrite",
    overwriteSchema="true",
)

print(f"Wrote Bronze table: {TARGET_TABLE}")

Wrote Bronze table: retail_project.bronze.active_promotions


In [0]:
# Sanity check: show how many rows the Bronze table holds after the write
(
    spark.table(TARGET_TABLE)
    .agg(F.count(F.lit(1)).alias("row_count"))
    .show()
)

+---------+
|row_count|
+---------+
|      423|
+---------+

