In [0]:
# Unity Catalog coordinates of the bronze prices table.
catalog = "stock_pipeline"
schema = "bronze"
table = ".".join([catalog, schema, "prices"])

# Root of the external S3 location; every path below is derived from it.
bucket = "s3://stock-pipeline-data-dev-mc"

# Data locations.
raw_price_path = bucket + "/raw/prices/"        # source files picked up by Auto Loader
bronze_price_path = bucket + "/bronze/prices/"  # Delta files backing the bronze table

# Streaming bookkeeping, kept apart from the data itself.
ckpt_price_path = bucket + "/_checkpoints/bronze_prices"          # base for the write-stream checkpoint
ckpt_schema_path = bucket + "/_checkpoints/bronze_prices_schema"  # Auto Loader schema location

In [0]:
# Stream the raw price files in through Auto Loader (the "cloudFiles" source).
autoloader_options = {
    "cloudFiles.format": "parquet",
    "cloudFiles.inferColumnTypes": "true",
    # Also pick up files already present in the raw path (backfill on first run).
    "cloudFiles.includeExistingFiles": "true",
    # Where Auto Loader persists the schema it infers and tracks.
    "cloudFiles.schemaLocation": ckpt_schema_path,
}

df = (
    spark.readStream
    .format("cloudFiles")
    .options(**autoloader_options)
    .load(raw_price_path)
)

In [0]:
# Project the raw stream onto the fixed bronze column set with explicit types
from pyspark.sql.functions import col, to_date, year, month

# Date parsed from the string form of the source "date" column; the same
# expression feeds the derived year/month columns below.
trade_date = to_date(col("date").cast("string"))

# Measures that are all stored as double in the bronze output.
double_columns = ["open", "high", "low", "close", "volume"]

df_norm = df.select(
    col("symbol").cast("string").alias("symbol"),
    trade_date.alias("date"),
    *[col(name).cast("double").alias(name) for name in double_columns],
    # Derived calendar parts; the writer cell partitions on these.
    year(trade_date).alias("year"),
    month(trade_date).alias("month"),
)

In [0]:
# Append the normalized stream to the bronze Delta location.
# NOTE(review): partitioning on year/month/symbol may produce many very small
# partitions -- confirm against actual data volume before relying on this layout.
bronze_writer = (
    df_norm.writeStream
    .format("delta")
    .options(
        checkpointLocation=f"{ckpt_price_path}/stream",
        mergeSchema="true",
    )
    .partitionBy("year", "month", "symbol")
    # availableNow: process everything currently pending, then stop the query.
    .trigger(availableNow=True)
)

q = bronze_writer.start(bronze_price_path)

# Block the notebook until the run-to-completion query has finished.
q.awaitTermination()

In [0]:
# Register the Delta files at the bronze path as a catalog table (no-op when
# the table is already registered).
create_table_sql = f"""
CREATE TABLE IF NOT EXISTS {table}
USING DELTA
LOCATION '{bronze_price_path}'
"""

# Turn on the Delta change data feed for the table.
# NOTE(review): this runs after the stream has already written, so rows
# committed before the property is set are presumably absent from the change
# feed -- confirm downstream readers account for that.
enable_cdf_sql = f"""
ALTER TABLE {table}
SET TBLPROPERTIES (delta.enableChangeDataFeed = true)
"""

spark.sql(create_table_sql)
spark.sql(enable_cdf_sql)

In [0]:
%sql
-- Spot-check: preview up to ten AAPL rows from the bronze prices table.
SELECT
  *
FROM
  stock_pipeline.bronze.prices
WHERE
  symbol = 'AAPL'
LIMIT
  10