In [0]:
# 02 Bronze Raw Ingestion

# This notebook ingests raw pharmacy data into the Bronze layer.

# - Source: Generated pharmacy dataset
# - Layer: Bronze (raw, immutable)
# - Storage format: Delta
# - No business transformations applied (only ingestion metadata columns are added: ingestion_timestamp, source_system)


In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import *


In [0]:
# Root of the pharmacy data volume; both locations below live under it.
PHARMACY_DATA_ROOT = "/Volumes/workspace/default/pharmacy_data"

# Input: Delta dataset produced by the data generation notebook.
SOURCE_DELTA_PATH = f"{PHARMACY_DATA_ROOT}/generated_delta"

# Output: Delta location that holds the Bronze pharmacy events.
BRONZE_DELTA_PATH = f"{PHARMACY_DATA_ROOT}/bronze/pharmacy_events"


In [0]:
# Load the generated pharmacy dataset from its Delta location.
raw_df = spark.read.load(SOURCE_DELTA_PATH, format="delta")

# Sanity check: total row count plus a five-row preview.
# count() is a Spark action, so this line triggers a job.
print("Raw record count:", raw_df.count())
raw_df.show(n=5)


In [0]:
# Ingestion metadata attached to every raw record; the source columns
# themselves are passed through untouched.
ingestion_ts_col = F.current_timestamp()
source_system_col = F.lit("synthetic_generator_v1")

bronze_df = raw_df.withColumn("ingestion_timestamp", ingestion_ts_col).withColumn(
    "source_system", source_system_col
)


In [0]:
# Persist the Bronze DataFrame as Delta files at BRONZE_DELTA_PATH.
# mode="overwrite" replaces whatever a previous run left at that location.
bronze_df.write.save(BRONZE_DELTA_PATH, format="delta", mode="overwrite")


In [0]:
%sql
-- Diagnostic: show the catalog and schema this session currently has selected.
SELECT current_catalog(), current_schema();


In [0]:
%sql
-- Select `workspace` as the current catalog for the statements that follow.
USE CATALOG workspace;


In [0]:
%sql
-- Create the pharmacy_bronze schema in the workspace catalog unless it already exists.
-- NOTE(review): the saveAsTable calls visible in this notebook use unqualified table
-- names and never reference this schema; confirm whether the tables were meant to live here.
CREATE SCHEMA IF NOT EXISTS workspace.pharmacy_bronze;


In [0]:
# Register the persisted Bronze Delta files as the table
# `pharmacy_bronze_pharmacy_events`, replacing any previous contents.
#
# The location comes from BRONZE_DELTA_PATH (the same constant the Bronze
# write uses) rather than a repeated literal, so the two cannot drift apart.
#
# NOTE(review): the name is unqualified, so the table is created in the
# session's current schema, not in `workspace.pharmacy_bronze`. Confirm whether
# `pharmacy_bronze.pharmacy_events` was intended before renaming; the COUNT
# query later in this notebook depends on the current name.
(
    spark.read.format("delta")
    .load(BRONZE_DELTA_PATH)
    .write
    .mode("overwrite")
    .saveAsTable("pharmacy_bronze_pharmacy_events")
)


In [0]:
%sql
-- Row count of the table registered from the Bronze Delta files.
SELECT COUNT(*) 
FROM pharmacy_bronze_pharmacy_events;


In [0]:
# Register the Bronze data as the table `bronze_pharmacy_events`, replacing
# any previous contents.
#
# The table is built from the Delta files already persisted at
# BRONZE_DELTA_PATH, not from the lazy `bronze_df`. Writing `bronze_df` a
# second time would re-run its plan: the source would be scanned again and
# current_timestamp() re-evaluated, so `ingestion_timestamp` in this table
# would differ from the value stored in the Bronze files.
#
# NOTE(review): this holds the same data as `pharmacy_bronze_pharmacy_events`;
# confirm whether both tables are needed.
(
    spark.read.format("delta")
    .load(BRONZE_DELTA_PATH)
    .write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("bronze_pharmacy_events")
)


In [0]:
# Read the registered Bronze table back through the catalog to confirm it
# is queryable, then report its size and column layout.
df = spark.read.table("bronze_pharmacy_events")

print("Bronze row count:", df.count())
df.printSchema()


In [0]:
# Final status line. This cell performs no validation itself; the message is
# printed whenever execution reaches this point.
print("Bronze ingestion completed successfully")
