In [0]:
# Bronze ingestion config
# Every location this notebook reads from or writes to is declared here.

# Fully qualified name of the Delta table the Bronze data is saved to.
TARGET_TABLE: str = "retail_project.bronze.company_employees"

# Folder holding the raw source files that get ingested.
SOURCE_PATH: str = "dbfs:/databricks-datasets/retail-org/company_employees/"

In [0]:
# Imports
from pyspark.sql import functions as F

In [0]:
# Detect file format
#
# Lists SOURCE_PATH and derives FILE_FORMAT from the extensions of ALL data
# files, not just the first one listed. The whole folder is later loaded with a
# single reader format, so the cell fails fast when the folder has no data
# files, contains a file of an unsupported type, or mixes several formats.
SUPPORTED_FORMATS = ("parquet", "csv", "json")

files = dbutils.fs.ls(SOURCE_PATH)

# Skip sub-directories (names ending in "/") and bookkeeping files whose names
# start with "_" or "." (e.g. _SUCCESS, .crc): they are not data and must not
# decide -- or break -- the format detection.
file_names = [
    f.name
    for f in files
    if not f.name.endswith("/") and not f.name.startswith(("_", "."))
]

if not file_names:
    raise ValueError(f"No files found under {SOURCE_PATH}")

# representative file (first data file in the listing)
sample = file_names[0].lower()

# Map every data file to the supported format its extension matches
# (case-insensitive), or None when it matches none of them.
formats_by_file = {
    name: next(
        (fmt for fmt in SUPPORTED_FORMATS if name.lower().endswith(f".{fmt}")),
        None,
    )
    for name in file_names
}

unsupported = sorted(name for name, fmt in formats_by_file.items() if fmt is None)
if unsupported:
    raise ValueError(
        f"Unsupported/unknown file type in {SOURCE_PATH}: {', '.join(unsupported)}"
    )

detected_formats = sorted(set(formats_by_file.values()))
if len(detected_formats) > 1:
    # One reader format cannot correctly parse a folder of mixed file types.
    raise ValueError(
        f"Mixed file formats under {SOURCE_PATH}: {', '.join(detected_formats)}"
    )

FILE_FORMAT = detected_formats[0]

print("Detected format:", FILE_FORMAT)

Detected format: csv


In [0]:
# Read raw data in the detected format
#
# The reader format comes from FILE_FORMAT (set by the format-detection cell)
# instead of being hard-coded, so the read always matches what was detected.
# For the CSV case the options are exactly: header, inferSchema, PERMISSIVE.
reader = spark.read.format(FILE_FORMAT)

if FILE_FORMAT == "csv":
    # CSV files carry no schema: take column names from the header row and let
    # Spark infer the column types from the data.
    reader = reader.option("header", "true").option("inferSchema", "true")

if FILE_FORMAT in ("csv", "json"):
    # Text formats can contain malformed records; PERMISSIVE keeps those rows
    # (unparseable fields become null) rather than dropping them or failing.
    reader = reader.option("mode", "PERMISSIVE")

df_raw = reader.load(SOURCE_PATH)

# Bronze enrichment (standard): audit columns recording when each row was read
# and which source file (path and size) it came from, taken from the
# `_metadata` column exposed by the file reader.
df_bronze = (
    df_raw
    .withColumn("_read_timestamp", F.current_timestamp())
    .withColumn("_source_path", F.col("_metadata.file_path"))
    .withColumn("_file_size", F.col("_metadata.file_size"))
)

# Preview a few rows and the resulting schema.
display(df_bronze.limit(10))
df_bronze.printSchema()

employee_id,employee_name,department,region,employee_key,active_record,active_record_start,active_record_end,_read_timestamp,_source_path,_file_size
0,"COOK, KIMBERLY",SALES,Northeast,48001,1,2010-10-30,,2025-12-13T16:15:37.849Z,dbfs:/databricks-datasets/retail-org/company_employees/company_employees.csv,27990
1,"BOYLAN, CHAD A",SALES,Northeast,48002,1,2019-03-04,,2025-12-13T16:15:37.849Z,dbfs:/databricks-datasets/retail-org/company_employees/company_employees.csv,27990
2,"SANDOVAL JR, DANIEL",SALES,Northeast,48003,0,2016-07-19,2017-04-19,2025-12-13T16:15:37.849Z,dbfs:/databricks-datasets/retail-org/company_employees/company_employees.csv,27990
3,"APOSTOLOS, CONSTANTIN E",SALES,Northeast,48004,0,2011-09-06,2012-06-06,2025-12-13T16:15:37.849Z,dbfs:/databricks-datasets/retail-org/company_employees/company_employees.csv,27990
4,"LIPINSKI, TINA L",SALES,Northeast,48005,1,2017-01-25,,2025-12-13T16:15:37.849Z,dbfs:/databricks-datasets/retail-org/company_employees/company_employees.csv,27990
5,"DAHLSTROM, SCOTT M",SALES,Northeast,48006,1,2013-10-22,,2025-12-13T16:15:37.849Z,dbfs:/databricks-datasets/retail-org/company_employees/company_employees.csv,27990
6,"ORTIZ, FRANCISCO",SALES,Northeast,48007,1,2013-06-09,,2025-12-13T16:15:37.849Z,dbfs:/databricks-datasets/retail-org/company_employees/company_employees.csv,27990
7,"LOZADA, JOSUE",SALES,Northeast,48008,1,2017-01-23,,2025-12-13T16:15:37.849Z,dbfs:/databricks-datasets/retail-org/company_employees/company_employees.csv,27990
8,"MC GHEE, CYNTHIA L",SALES,Northeast,48009,1,2019-10-23,,2025-12-13T16:15:37.849Z,dbfs:/databricks-datasets/retail-org/company_employees/company_employees.csv,27990
9,"BELL, REGINALD V",SALES,Northeast,48010,1,2015-03-31,,2025-12-13T16:15:37.849Z,dbfs:/databricks-datasets/retail-org/company_employees/company_employees.csv,27990


root
 |-- employee_id: integer (nullable = true)
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- region: string (nullable = true)
 |-- employee_key: integer (nullable = true)
 |-- active_record: integer (nullable = true)
 |-- active_record_start: date (nullable = true)
 |-- active_record_end: date (nullable = true)
 |-- _read_timestamp: timestamp (nullable = false)
 |-- _source_path: string (nullable = false)
 |-- _file_size: long (nullable = false)



In [0]:
# Write to Delta Bronze table
bronze_writer = df_bronze.write.format("delta")

# Full refresh: each run replaces the table contents, and the table schema is
# allowed to be replaced by the schema of the incoming DataFrame.
bronze_writer = bronze_writer.mode("overwrite").option("overwriteSchema", "true")

bronze_writer.saveAsTable(TARGET_TABLE)

print(f"Wrote Bronze table: {TARGET_TABLE}")

Wrote Bronze table: retail_project.bronze.company_employees


In [0]:
# Quick validation
#
# Show the persisted row count, then assert it against the DataFrame that was
# written, so an empty or mismatched table fails the run instead of passing
# silently under "Run All".
row_count_df = spark.sql(f"SELECT COUNT(*) AS row_count FROM {TARGET_TABLE}")
row_count_df.show()

written_rows = row_count_df.first()["row_count"]
source_rows = df_bronze.count()  # re-counts the source read that fed the write

assert written_rows > 0, f"{TARGET_TABLE} is empty after the Bronze write"
assert written_rows == source_rows, (
    f"Row count mismatch for {TARGET_TABLE}: "
    f"table has {written_rows}, source DataFrame has {source_rows}"
)

+---------+
|row_count|
+---------+
|      510|
+---------+

