In [0]:
from pyspark.sql import functions as F
import re

# Name parts of the destination table; joined below into one dotted identifier.
CATALOG_DB = "sales_db"
BRONZE_SCHEMA = "bronze"
TABLE_NAME = "orders_raw"

# Location of the source CSV handed to spark.read.
# NOTE(review): this is a plain string (not an f-string) and nothing visible in
# this notebook calls .format() on it, so the {bronze_container},
# {storage_account} and {input_file} placeholders are passed through verbatim.
# Presumably the real values were redacted -- confirm and fill them in.
BRONZE_INPUT_PATH = "abfss://{bronze_container}@{storage_account}.dfs.core.windows.net/{input_file}"

# Fully qualified table name used when writing and reading the bronze table.
bronze_table = ".".join((CATALOG_DB, BRONZE_SCHEMA, TABLE_NAME))

In [0]:
# Make sure the namespaces that will hold the bronze table exist.
# Both statements use IF NOT EXISTS, so re-running this cell is harmless.
# NOTE(review): CATALOG_DB is also used as the first part of a three-part table
# name; confirm CREATE DATABASE is the intended statement for that level on
# this platform.
create_database_stmt = f"CREATE DATABASE IF NOT EXISTS {CATALOG_DB}"
create_schema_stmt = f"CREATE SCHEMA IF NOT EXISTS {CATALOG_DB}.{BRONZE_SCHEMA}"

spark.sql(create_database_stmt)
spark.sql(create_schema_stmt)

DataFrame[]

In [0]:
# Load the raw CSV into a DataFrame.
# All reader settings live in one mapping so they can be reviewed at a glance.
csv_read_options = {
    "header": "true",        # first line holds the column names
    "inferSchema": "true",   # let Spark guess column types from the data
    "multiLine": "true",     # quoted fields may span several physical lines
    "escape": "\"",          # a quote inside a quoted field is escaped by a quote
    "quote": "\"",
}
# NOTE(review): with inferSchema the rendered sample shows Postal Code parsed as
# a number (42420.0); confirm that is acceptable, since a numeric type cannot
# keep leading zeros.
df = spark.read.options(**csv_read_options).csv(BRONZE_INPUT_PATH)

display(df)

Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,State,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales
1,CA-2017-152156,2017-11-08,2017-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,42420.0,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96
2,CA-2017-152156,2017-11-08,2017-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,42420.0,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs, Rounded Back",731.94
3,CA-2017-138688,2017-06-12,2017-06-16,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,California,90036.0,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters by Universal,14.62
4,US-2016-108966,2016-10-11,2016-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,Florida,33311.0,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775
5,US-2016-108966,2016-10-11,2016-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,Florida,33311.0,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368
6,CA-2015-115812,2015-06-09,2015-06-14,Standard Class,BH-11710,Brosina Hoffman,Consumer,United States,Los Angeles,California,90032.0,West,FUR-FU-10001487,Furniture,Furnishings,"Eldon Expressions Wood and Plastic Desk Accessories, Cherry Wood",48.86
7,CA-2015-115812,2015-06-09,2015-06-14,Standard Class,BH-11710,Brosina Hoffman,Consumer,United States,Los Angeles,California,90032.0,West,OFF-AR-10002833,Office Supplies,Art,Newell 322,7.28
8,CA-2015-115812,2015-06-09,2015-06-14,Standard Class,BH-11710,Brosina Hoffman,Consumer,United States,Los Angeles,California,90032.0,West,TEC-PH-10002275,Technology,Phones,Mitel 5320 IP Phone VoIP phone,907.152
9,CA-2015-115812,2015-06-09,2015-06-14,Standard Class,BH-11710,Brosina Hoffman,Consumer,United States,Los Angeles,California,90032.0,West,OFF-BI-10003910,Office Supplies,Binders,DXL Angle-View Binders with Locking Rings by Samsill,18.504
10,CA-2015-115812,2015-06-09,2015-06-14,Standard Class,BH-11710,Brosina Hoffman,Consumer,United States,Los Angeles,California,90032.0,West,OFF-AP-10002892,Office Supplies,Appliances,Belkin F5C206VTEL 6 Outlet Surge,114.9


In [0]:
# Sanitize column names
# One or more consecutive characters that are not ASCII letters or digits.
# Underscores are included on purpose so that runs of them are collapsed too.
_NON_ALNUM_RUN = re.compile(r"[^0-9a-zA-Z]+")


def sanitize_name(c: str) -> str:
    """Convert a raw column header into a lower-case, underscore-separated name.

    Every maximal run of characters other than ASCII letters and digits
    (whitespace, punctuation, underscores, non-ASCII characters) is replaced
    by a single "_"; leading and trailing underscores are then removed and the
    result is lower-cased. This is done in one substitution pass: separate
    whitespace-stripping and special-character passes are unnecessary because
    those characters are already part of the non-alphanumeric runs.

    Args:
        c: Raw column name, e.g. "Sub-Category" or " Postal Code ".

    Returns:
        The sanitized name, e.g. "sub_category" or "postal_code". The result
        is an empty string when ``c`` contains no ASCII letter or digit, and
        different inputs can produce the same output (e.g. "Row ID" and
        "row-id"), so callers that need unique names must handle that.
    """
    return _NON_ALNUM_RUN.sub("_", c).strip("_").lower()

# Sanitize every header and make sure the resulting names are usable.
# sanitize_name can return "" (header without ASCII letters/digits) and can map
# two different headers to the same name (e.g. "Sub-Category" / "Sub Category"),
# which would leave columns that are ambiguous to reference downstream.
# Empty names fall back to a positional "col_<index>"; repeated names receive a
# numeric suffix (_2, _3, ...). Headers that already sanitize to distinct,
# non-empty names are left exactly as sanitize_name returns them.
sanitized_cols = []
taken_names = set()
for position, raw_name in enumerate(df.columns):
    base_name = sanitize_name(raw_name) or f"col_{position}"
    unique_name, attempt = base_name, 1
    while unique_name in taken_names:
        attempt += 1
        unique_name = f"{base_name}_{attempt}"
    taken_names.add(unique_name)
    sanitized_cols.append(unique_name)

# toDF renames positionally, so the order of sanitized_cols must match df.columns.
df = df.toDF(*sanitized_cols)


# Attach ingestion metadata to every row:
#   ingest_ts   - timestamp taken from Spark's current_timestamp()
#   source_file - constant label naming the file this load came from
ingest_ts_col = F.current_timestamp()
source_file_col = F.lit("Sales_data.csv")

bronze_df = df.withColumn("ingest_ts", ingest_ts_col)
bronze_df = bronze_df.withColumn("source_file", source_file_col)

In [0]:
# Persist the bronze DataFrame as a Delta table.
# Mode "overwrite" means each run overwrites the table's existing contents.
delta_writer = bronze_df.write.format("delta").mode("overwrite")
delta_writer.saveAsTable(bronze_table)

# Report what was written; the row count is read back from the saved table
# rather than from the in-memory DataFrame.
print(f"Bronze table written: {TABLE_NAME}")
written_table = spark.table(bronze_table)
print("Rows:", written_table.count())


Bronze table written: orders_raw
Rows: 9800


In [0]:
# Preview a few rows of the table that was just written.
# The name comes from bronze_table (built from the config constants) instead of
# a hard-coded "orders_raw", so the preview keeps pointing at the written table
# if TABLE_NAME is ever changed.
query = f"SELECT * FROM {bronze_table} limit 5"
spark.sql(query).show()

+------+--------------+----------+----------+--------------+-----------+---------------+---------+-------------+---------------+----------+-----------+------+---------------+---------------+------------+--------------------+--------+--------------------+--------------+
|row_id|      order_id|order_date| ship_date|     ship_mode|customer_id|  customer_name|  segment|      country|           city|     state|postal_code|region|     product_id|       category|sub_category|        product_name|   sales|           ingest_ts|   source_file|
+------+--------------+----------+----------+--------------+-----------+---------------+---------+-------------+---------------+----------+-----------+------+---------------+---------------+------------+--------------------+--------+--------------------+--------------+
|     1|CA-2017-152156|2017-11-08|2017-11-11|  Second Class|   CG-12520|    Claire Gute| Consumer|United States|      Henderson|  Kentucky|      42420| South|FUR-BO-10001798|      Furniture|