In [0]:
# Bronze ingestion config
# SOURCE_PATH: directory that is listed for format detection and then passed
# to the Spark reader as the load path.
SOURCE_PATH = "dbfs:/databricks-datasets/retail-org/loyalty_segments/"
# TARGET_TABLE: name of the Delta table the Bronze DataFrame is saved to and
# that the final row-count check queries.
TARGET_TABLE = "retail_project.bronze.loyalty_segments"

In [0]:
# Imports
from pyspark.sql import functions as F

In [0]:
# Detect file format (standardized, Spark-safe)
#
# Lists SOURCE_PATH, derives the single file extension shared by its data
# files and publishes it as FILE_FORMAT. Raises ValueError when no data files
# are found, when more than one extension is present, or when the extension
# is not one of the allowed formats.

files = dbutils.fs.ls(SOURCE_PATH)

# Keep only real data files:
#   - directory entries are skipped (dbutils.fs.ls reports their names with a
#     trailing "/"), so a dotted directory name such as "v1.0/" is not
#     mistaken for a file with an extension;
#   - names starting with "_" or "." are skipped (markers such as _SUCCESS,
#     checksum side-files such as .part-0000.crc), since Spark's file readers
#     ignore them as well;
#   - names without any "." carry no extension to inspect.
data_files = [
    f.name.lower()
    for f in files
    if not f.name.endswith("/")
    and not f.name.startswith(("_", "."))
    and "." in f.name
]

if not data_files:
    raise ValueError(f"No data files found under {SOURCE_PATH}")

# Collect unique file extensions (text after the last dot)
extensions = {name.rsplit(".", 1)[-1] for name in data_files}

# Enforce single-format sources
if len(extensions) != 1:
    raise ValueError(
        f"Mixed or unsupported file types under {SOURCE_PATH}: {extensions}"
    )

# Exactly one element is left at this point, so pop() returns it.
FILE_FORMAT = extensions.pop()

# Allow only known formats
if FILE_FORMAT not in {"parquet", "csv", "json", "xml"}:
    raise ValueError(
        f"Unsupported file format '{FILE_FORMAT}' under {SOURCE_PATH}"
    )

print("Detected format:", FILE_FORMAT)

Detected format: csv


In [0]:
# Read raw CSV data
#
# The options below (header, inferSchema) only make sense for CSV, so stop
# early if format detection found something else instead of silently
# misparsing the files as CSV.
if FILE_FORMAT != "csv":
    raise ValueError(
        f"This cell reads CSV, but detected format is '{FILE_FORMAT}' under {SOURCE_PATH}"
    )

reader = (
    spark.read
         .format(FILE_FORMAT)              # "csv" here, guaranteed by the guard above
         .option("header", "true")         # first line holds the column names
         .option("inferSchema", "true")    # column types are derived from the data
         .option("mode", "PERMISSIVE")     # keep malformed rows instead of failing
)

df_raw = reader.load(SOURCE_PATH)

# Bronze enrichment (standard): add lineage columns next to the raw columns.
#   _read_timestamp - time at which this read was evaluated
#   _source_path    - path of the file each row came from
#   _file_size      - size of that file, taken from the file metadata column
df_bronze = (
    df_raw
    .withColumn("_read_timestamp", F.current_timestamp())
    .withColumn("_source_path", F.col("_metadata.file_path"))
    .withColumn("_file_size", F.col("_metadata.file_size"))
)

# Preview a few rows and the resulting schema.
display(df_bronze.limit(10))
df_bronze.printSchema()

loyalty_segment_id,loyalty_segment_description,unit_threshold,valid_from,valid_to,_read_timestamp,_source_path,_file_size
0,level_0,0,2017-01-01,,2025-12-13T16:14:48.947Z,dbfs:/databricks-datasets/retail-org/loyalty_segments/loyalty_segment.csv,181
1,level_1,10,2017-01-01,,2025-12-13T16:14:48.947Z,dbfs:/databricks-datasets/retail-org/loyalty_segments/loyalty_segment.csv,181
2,level_2,30,2017-01-01,,2025-12-13T16:14:48.947Z,dbfs:/databricks-datasets/retail-org/loyalty_segments/loyalty_segment.csv,181
3,level_3,70,2017-01-01,,2025-12-13T16:14:48.947Z,dbfs:/databricks-datasets/retail-org/loyalty_segments/loyalty_segment.csv,181


root
 |-- loyalty_segment_id: integer (nullable = true)
 |-- loyalty_segment_description: string (nullable = true)
 |-- unit_threshold: integer (nullable = true)
 |-- valid_from: date (nullable = true)
 |-- valid_to: string (nullable = true)
 |-- _read_timestamp: timestamp (nullable = false)
 |-- _source_path: string (nullable = false)
 |-- _file_size: long (nullable = false)



In [0]:
# Persist the Bronze DataFrame as a Delta table (full refresh on every run)
bronze_writer = (
    df_bronze.write
        .format("delta")
        .mode("overwrite")                  # replace whatever the table held before
        .option("overwriteSchema", "true")  # incoming schema replaces the stored one
)
bronze_writer.saveAsTable(TARGET_TABLE)

print(f"Wrote Bronze table: {TARGET_TABLE}")

Wrote Bronze table: retail_project.bronze.loyalty_segments


In [0]:
# Sanity check: show how many rows the freshly written table contains
row_count_query = f"SELECT COUNT(*) AS row_count FROM {TARGET_TABLE}"
spark.sql(row_count_query).show()

+---------+
|row_count|
+---------+
|        4|
+---------+

