In [0]:
from pyspark.sql import functions as F
import re

# --- Bronze-layer configuration ---------------------------------------------
# The three parts of the target table identifier.
CATALOG_DB = "sales_db"
BRONZE_SCHEMA = "bronze"
TABLE_NAME = "orders_raw"

# Location of the raw CSV that is read into the bronze table.
# NOTE(review): this is a plain string, not an f-string, and nothing visible in
# this notebook substitutes {bronze_container}, {storage_account} or
# {input_file} - confirm the real values are filled in before the read runs.
BRONZE_INPUT_PATH = "abfss://{bronze_container}@{storage_account}.dfs.core.windows.net/{input_file}"

# Fully qualified table name: <CATALOG_DB>.<BRONZE_SCHEMA>.<TABLE_NAME>.
bronze_table = ".".join((CATALOG_DB, BRONZE_SCHEMA, TABLE_NAME))

In [0]:
# Create the catalog and the bronze schema inside it; both statements are
# no-ops when the object already exists.
# The first part of the three-part name CATALOG_DB.BRONZE_SCHEMA.TABLE_NAME is
# a catalog. In Spark SQL, DATABASE is a synonym for SCHEMA, so
# "CREATE DATABASE sales_db" would only create a schema called sales_db in the
# current catalog and would not provide the catalog the second statement needs.
# NOTE(review): CREATE CATALOG assumes a Unity Catalog-enabled Databricks
# workspace and the privilege to create catalogs - confirm for this environment.
spark.sql(f"CREATE CATALOG IF NOT EXISTS {CATALOG_DB}")
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG_DB}.{BRONZE_SCHEMA}")

DataFrame[]

In [0]:
# Load the raw CSV at BRONZE_INPUT_PATH into a DataFrame.
#   header      - take the column names from the first row
#   inferSchema - let Spark infer the column types from the data
#   multiLine   - allow a quoted value to contain line breaks
#   escape/quote - both are the double quote, so a doubled quote inside a
#                  quoted value is read as a literal quote character
df = spark.read.options(
    header="true",
    inferSchema="true",
    multiLine="true",
    escape="\"",
    quote="\"",
).csv(BRONZE_INPUT_PATH)

# Preview the first five rows without truncating long values.
df.show(5, truncate=False)

+------+--------------+----------+----------+--------------+-----------+---------------+---------+-------------+---------------+----------+-----------+------+---------------+---------------+------------+-----------------------------------------------------------+--------+
|Row ID|Order ID      |Order Date|Ship Date |Ship Mode     |Customer ID|Customer Name  |Segment  |Country      |City           |State     |Postal Code|Region|Product ID     |Category       |Sub-Category|Product Name                                               |Sales   |
+------+--------------+----------+----------+--------------+-----------+---------------+---------+-------------+---------------+----------+-----------+------+---------------+---------------+------------+-----------------------------------------------------------+--------+
|1     |CA-2017-152156|2017-11-08|2017-11-11|Second Class  |CG-12520   |Claire Gute    |Consumer |United States|Henderson      |Kentucky  |42420      |South |FUR-BO-10001798|Furnitu

In [0]:
# Sanitize column names 
def sanitize_name(c: str, default: str = "unnamed") -> str:
    """Turn a raw header into a lower-case, underscore-separated column name.

    Every run of characters other than ASCII letters and digits (spaces,
    punctuation, existing underscores, non-ASCII characters) becomes a single
    "_"; leading and trailing underscores are dropped and the result is
    lower-cased, e.g. "Sub-Category" -> "sub_category".

    Args:
        c: Raw column name.
        default: Name returned when nothing is left after sanitizing
            (e.g. for "#", "%" or an empty header), so the function never
            returns an empty string.

    Returns:
        The sanitized name, or ``default`` if the sanitized name is empty.
    """
    # A single pass is enough: separators, other special characters and
    # repeated underscores all fall into "not an ASCII letter or digit".
    cleaned = re.sub(r"[^0-9a-zA-Z]+", "_", c).strip("_").lower()
    return cleaned or default

sanitized_cols = [sanitize_name(c) for c in df.columns]

# Different raw headers can sanitize to the same name (for example "Order ID"
# and "Order_ID"). Stop here and report the offending headers rather than
# producing a DataFrame with duplicate column names.
colliding = [
    (raw, clean)
    for raw, clean in zip(df.columns, sanitized_cols)
    if sanitized_cols.count(clean) > 1
]
if colliding:
    raise ValueError(
        f"Column names collide after sanitizing (raw, sanitized): {colliding}"
    )

df = df.toDF(*sanitized_cols)


# Add ingestion metadata: the load timestamp and the name of the source file.
# NOTE(review): source_file is a hard-coded literal and is not derived from
# BRONZE_INPUT_PATH - confirm it matches the file that is actually read.
bronze_df = (
    df.withColumn("ingest_ts", F.current_timestamp())
      .withColumn("source_file", F.lit("Sales_data.csv"))
)

In [0]:
# Persist bronze_df as a Delta table, replacing whatever the table held before.
bronze_df.write.saveAsTable(bronze_table, format="delta", mode="overwrite")

# Report the table name, then the row count read back from the saved table.
print(f"Bronze table written: {TABLE_NAME}")
bronze_row_count = spark.table(bronze_table).count()
print("Rows:", bronze_row_count)


Bronze table written: orders_raw
Rows: 9800


In [0]:
# Preview a few rows of the bronze table. The query is built from bronze_table
# so it targets the same table the write step used, even if TABLE_NAME changes.
query = f"SELECT * FROM {bronze_table} limit 5"
spark.sql(query).show()

+------+--------------+----------+----------+--------------+-----------+---------------+---------+-------------+---------------+----------+-----------+------+---------------+---------------+------------+--------------------+--------+--------------------+--------------+
|row_id|      order_id|order_date| ship_date|     ship_mode|customer_id|  customer_name|  segment|      country|           city|     state|postal_code|region|     product_id|       category|sub_category|        product_name|   sales|           ingest_ts|   source_file|
+------+--------------+----------+----------+--------------+-----------+---------------+---------+-------------+---------------+----------+-----------+------+---------------+---------------+------------+--------------------+--------+--------------------+--------------+
|     1|CA-2017-152156|2017-11-08|2017-11-11|  Second Class|   CG-12520|    Claire Gute| Consumer|United States|      Henderson|  Kentucky|      42420| South|FUR-BO-10001798|      Furniture|