In [0]:
# Bronze ingestion settings.
# SOURCE_PATH  - DBFS directory holding the raw customer files to ingest.
# TARGET_TABLE - name of the table the ingested rows are saved to.
SOURCE_PATH: str = "dbfs:/databricks-datasets/retail-org/customers/"
TARGET_TABLE: str = "retail_project.bronze.customers"

In [0]:
# Imports
from pyspark.sql import functions as F

In [0]:
# Detect the source file format from the file extensions under SOURCE_PATH.
#
# Result: FILE_FORMAT holds the single lower-cased extension shared by every
# data file in the directory.
# Raises: ValueError when no data files are found, when more than one
# extension is present, or when the extension is not in the supported set.

files = dbutils.fs.ls(SOURCE_PATH)

# Keep only real data files. Entries that are skipped:
#   * names starting with "_"  - marker/metadata files such as _SUCCESS
#   * names starting with "."  - hidden files such as .part-0000.crc; without
#                                this they would count as a second "crc" format
#   * names ending with "/"    - sub-directories (dbutils.fs.ls reports them
#                                with a trailing slash); a dotted directory
#                                name like "v1.0/" would otherwise be parsed
#                                as the extension "0/"
#   * names without any "."    - nothing to derive an extension from
data_files = [
    f.name.lower()
    for f in files
    if "." in f.name
    and not f.name.startswith(("_", "."))
    and not f.name.endswith("/")
]

if not data_files:
    raise ValueError(f"No data files found under {SOURCE_PATH}")

# Collect unique file extensions (text after the last "." in each name)
extensions = {name.split(".")[-1] for name in data_files}

# Enforce single-format sources
if len(extensions) != 1:
    raise ValueError(
        f"Mixed or unsupported file types under {SOURCE_PATH}: {extensions}"
    )

# Exactly one element is present at this point, so pop() is deterministic.
FILE_FORMAT = extensions.pop()

# Allow only known formats
if FILE_FORMAT not in {"parquet", "csv", "json", "xml"}:
    raise ValueError(
        f"Unsupported file format '{FILE_FORMAT}' under {SOURCE_PATH}"
    )

print("Detected format:", FILE_FORMAT)

Detected format: csv


In [0]:
# Read the raw source data using the format detected in FILE_FORMAT.
#
# The reader previously hard-coded "csv", so a source detected as Parquet or
# JSON would have been parsed as CSV. Format-specific options are looked up
# per format; formats without an entry are read with Spark's defaults.
# NOTE(review): an XML source will likely need a "rowTag" option added here
# before it can be loaded - confirm when the first XML source is onboarded.
reader_options_by_format = {
    "csv": {
        "header": "true",        # first line holds the column names
        "inferSchema": "true",   # let Spark derive column types from the data
        "mode": "PERMISSIVE",    # keep malformed rows instead of failing
    },
}

reader = (
    spark.read
         .format(FILE_FORMAT)
         .options(**reader_options_by_format.get(FILE_FORMAT, {}))
)

df_raw = reader.load(SOURCE_PATH)

# Bronze enrichment (standard): add lineage columns next to the raw columns.
#   _read_timestamp - timestamp taken when this read is evaluated
#   _source_path    - file each row came from (from the _metadata column)
#   _file_size      - size of that file (from the _metadata column)
df_bronze = (
    df_raw
    .withColumn("_read_timestamp", F.current_timestamp())
    .withColumn("_source_path", F.col("_metadata.file_path"))
    .withColumn("_file_size", F.col("_metadata.file_size"))
)

# Preview a small sample and the resulting schema for a quick visual check.
display(df_bronze.limit(10))
df_bronze.printSchema()

customer_id,tax_id,tax_code,customer_name,state,city,postcode,street,number,unit,region,district,lon,lat,ship_to_address,valid_from,valid_to,units_purchased,loyalty_segment,_read_timestamp,_source_path,_file_size
11123757,,,"SMITH, SHIRLEY",IN,BREMEN,46506.0,N CENTER ST,521.0,,Indiana,50.0,-86.1465825,41.4507625,"IN, 46506.0, N CENTER ST, 521.0",1532824233,1548137353.0,34.0,3,2025-12-13T15:37:41.868Z,dbfs:/databricks-datasets/retail-org/customers/customers.csv,4550361
30585978,,,"STEPHENS, GERALDINE M",OR,ADDRESS,0,NO SITUS,,,,,-122.1055158,45.374317,"OR, 0, NO SITUS, nan",1523100473,,18.0,3,2025-12-13T15:37:41.868Z,dbfs:/databricks-datasets/retail-org/customers/customers.csv,4550361
349822,,,"GUZMAN, CARMEN",VA,VIENNA,22181,HILL RD,2860,,VA,,-77.2941261,38.88303270000001,"VA, 22181, HILL RD, 2860",1522922493,,5.0,0,2025-12-13T15:37:41.868Z,dbfs:/databricks-datasets/retail-org/customers/customers.csv,4550361
27652636,,,"HASSETT, PATRICK J",WI,VILLAGE OF NASHOTAH,53058.0,IVY LANE,W333N 5591,,,,-88.40951700000002,43.1213789,"WI, 53058.0, IVY LANE, W333N 5591",1531834357,1558052195.0,7.0,1,2025-12-13T15:37:41.868Z,dbfs:/databricks-datasets/retail-org/customers/customers.csv,4550361
14437343,,,"HENTZ, DIANA L",OH,COLUMBUS,43228.0,ALLIANCE WAY,5706,,OH,FRA,-83.158438,39.97821810000001,"OH, 43228.0, ALLIANCE WAY, 5706",1517227530,,0.0,0,2025-12-13T15:37:41.868Z,dbfs:/databricks-datasets/retail-org/customers/customers.csv,4550361
20441596,,,"TIRADO, MARCO A",NY,Otselic,13072,County Road 16,2792,,NY,Chenango,-75.7505808,42.7172722,"NY, 13072, County Road 16, 2792",1519335250,,24.0,3,2025-12-13T15:37:41.868Z,dbfs:/databricks-datasets/retail-org/customers/customers.csv,4550361
5945686,,,"SKORA, BRIAN S",MI,,48205.0,E 8 MILE RD,16414.0,,,,-82.950874,42.4499233,"MI, 48205.0, E 8 MILE RD, 16414.0",1518988242,,7.0,1,2025-12-13T15:37:41.868Z,dbfs:/databricks-datasets/retail-org/customers/customers.csv,4550361
5385771,,,"SLAWEK, DEAN J",PA,,19147-3204,FITZWATER ST,328,,,,-75.14920550000002,39.9389473,"PA, 19147-3204, FITZWATER ST, 328",1518239268,,18.0,3,2025-12-13T15:37:41.868Z,dbfs:/databricks-datasets/retail-org/customers/customers.csv,4550361
1427940,,,"REAVES, LIONEL C",VA,HOT SPRINGS,24445.0,HOT SPRINGS RD,6419.0,,,,-79.90497859999998,37.8949737,"VA, 24445.0, HOT SPRINGS RD, 6419.0",1529087690,,10.0,2,2025-12-13T15:37:41.868Z,dbfs:/databricks-datasets/retail-org/customers/customers.csv,4550361
10457387,,,"BONGIOVANNI, KELLY M",IN,VINCENNES,47591,JERRY ST,2006.0,,Indiana,42.0,-87.519002,38.662178,"IN, 47591, JERRY ST, 2006.0",1535887733,,9.0,2,2025-12-13T15:37:41.868Z,dbfs:/databricks-datasets/retail-org/customers/customers.csv,4550361


root
 |-- customer_id: integer (nullable = true)
 |-- tax_id: double (nullable = true)
 |-- tax_code: string (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- state: string (nullable = true)
 |-- city: string (nullable = true)
 |-- postcode: string (nullable = true)
 |-- street: string (nullable = true)
 |-- number: string (nullable = true)
 |-- unit: string (nullable = true)
 |-- region: string (nullable = true)
 |-- district: string (nullable = true)
 |-- lon: double (nullable = true)
 |-- lat: double (nullable = true)
 |-- ship_to_address: string (nullable = true)
 |-- valid_from: integer (nullable = true)
 |-- valid_to: double (nullable = true)
 |-- units_purchased: double (nullable = true)
 |-- loyalty_segment: integer (nullable = true)
 |-- _read_timestamp: timestamp (nullable = false)
 |-- _source_path: string (nullable = false)
 |-- _file_size: long (nullable = false)



In [0]:
# Persist the enriched DataFrame as the Bronze Delta table.
# mode("overwrite") replaces the table contents on every run (full refresh);
# overwriteSchema lets the schema of this run replace the stored one.
bronze_writer = df_bronze.write.format("delta")
bronze_writer = bronze_writer.option("overwriteSchema", "true")
bronze_writer = bronze_writer.mode("overwrite")
bronze_writer.saveAsTable(TARGET_TABLE)

print(f"Wrote Bronze table: {TARGET_TABLE}")

Wrote Bronze table: retail_project.bronze.customers


In [0]:
# Sanity check: show how many rows ended up in the Bronze table.
row_count_query = f"SELECT COUNT(*) AS row_count FROM {TARGET_TABLE}"
row_count_df = spark.sql(row_count_query)
row_count_df.show()

+---------+
|row_count|
+---------+
|    28813|
+---------+

