In [0]:
# Storage locations in the external S3 bucket, plus the three-level table name.
bucket = "s3://stock-pipeline-data-dev-mc"
_checkpoint_root = bucket + "/_checkpoints"

raw_price_path = bucket + "/raw/prices/"                       # landing area read by the ingest stream
bronze_price_path = bucket + "/bronze/prices_v2/"              # where the Delta files are written
ckpt_price_path = _checkpoint_root + "/bronze_prices"          # base path for streaming checkpoints
ckpt_schema_path = _checkpoint_root + "/bronze_prices_schema"  # schema-tracking location for the reader

catalog, schema = "stock_pipeline", "bronze"
table = ".".join((catalog, schema, "prices_v2"))

In [0]:
# Open an Auto Loader (cloudFiles) streaming read over the raw price files.
autoloader_options = {
    "cloudFiles.format": "parquet",
    "ignoreMissingFiles": "true",
    "cloudFiles.inferColumnTypes": "true",
    # Skip files already present when the stream first starts (daily ingest).
    # NOTE(review): Auto Loader documents this option as evaluated only on the
    # first start of a stream -- confirm that skipping any backlog is intended.
    "cloudFiles.includeExistingFiles": "false",
    # Auto Loader persists the tracked schema under this path.
    "cloudFiles.schemaLocation": ckpt_schema_path,
}

raw_stream = spark.readStream.format("cloudFiles").options(**autoloader_options)
df = raw_stream.load(raw_price_path)

In [0]:
# Define Schema
from pyspark.sql.functions import col, to_date, year, month

# Pin the business columns to fixed types, then keep only the bronze column set
# (price fields first, ingestion metadata last).
_double_columns = ["open", "high", "low", "close", "volume"]
_metadata_columns = ["fetched_at", "source", "endpoint", "request_id", "file_hash"]

_typed = (
    df
    .withColumn("symbol", col("symbol").cast("string"))
    .withColumn("as_of_date", to_date(col("as_of_date")))
)
# Every numeric price field, volume included, is cast to double.
for _name in _double_columns:
    _typed = _typed.withColumn(_name, col(_name).cast("double"))

# The select fixes the output column order and drops any column not listed.
df_norm = _typed.select("symbol", "as_of_date", *_double_columns, *_metadata_columns)

In [0]:
# Append the normalized stream to the bronze Delta location.
sink_options = {
    "checkpointLocation": f"{ckpt_price_path}/stream",
    "mergeSchema": "true",
    "optimizeWrite": "true",   # intent: bin small files at write time
    # intent: compact automatically.
    # NOTE(review): Delta documents auto compaction as a table property /
    # session conf; confirm this writer option actually has an effect.
    "autoCompact": "true",
}

bronze_writer = (
    df_norm.writeStream
    .format("delta")
    .outputMode("append")
    .options(**sink_options)
)

# availableNow: process what is currently available, then stop.
q = bronze_writer.trigger(availableNow=True).start(bronze_price_path)

# Hold the cell until the run-to-completion query has finished.
q.awaitTermination()

In [0]:
# Register the Delta files at bronze_price_path as a catalog table. No column
# list is given, so the table definition comes from the Delta data already at
# that location.
spark.sql(f"""
    CREATE TABLE IF NOT EXISTS {table}
    USING DELTA
    LOCATION '{bronze_price_path}'
""")

# Table properties, applied in one statement (one commit):
#   - change data feed, as before;
#   - optimized writes and auto compaction as *table properties*. The write
#     cell only requests these through writer options, and auto compaction is
#     documented as a table property / session conf rather than a writer
#     option, so setting them here makes the intent hold for every writer.
spark.sql(f"""
    ALTER TABLE {table}
    SET TBLPROPERTIES (
        'delta.enableChangeDataFeed' = 'true',
        'delta.autoOptimize.optimizeWrite' = 'true',
        'delta.autoOptimize.autoCompact' = 'true'
    )
""")

# Column mapping is kept as its own statement so that a failure here does not
# block the properties above.
# NOTE(review): some Delta versions require the table protocol to be upgraded
# (minReaderVersion 2 / minWriterVersion 5) for this to succeed -- confirm
# against the runtime in use.
spark.sql(f"""
    ALTER TABLE {table}
    SET TBLPROPERTIES ('delta.columnMapping.mode' = 'name')
""")

# Compact and co-locate data by the ZORDER columns. The table is addressed by
# name, consistent with the statements above, instead of by storage path.
spark.sql(
    f"""
    OPTIMIZE {table}
    ZORDER BY (as_of_date, symbol)
    """
)