In [0]:
from pyspark.sql.functions import col, split, explode, expr, monotonically_increasing_id

In [0]:
# Bronze: incrementally ingest the raw text files with Auto Loader ("cloudFiles")
# and append them to a Delta table.
# NOTE(review): the checkpoint directory sits underneath the load path — confirm
# Auto Loader is not also picking up checkpoint files as text rows.
bronze_query = (
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "text")
    .load("/Volumes/workspace/dev/trading/")
    .writeStream
    .format("delta")
    .option("checkpointLocation", "/Volumes/workspace/dev/trading/checkpoint/bronze")
    .trigger(availableNow=True)
    .table("workspace.default.bronze_fix_messages")
)

# The query is started asynchronously. Block until the availableNow run has
# processed everything so that later cells (the preview query and the silver
# stream) see a complete bronze table when the notebook is run top to bottom.
bronze_query.awaitTermination()

In [0]:
%sql
-- Preview every row currently stored in the bronze table.
SELECT * FROM workspace.default.bronze_fix_messages

In [0]:
# Silver: split each pipe-delimited message from bronze into one column per
# positional field and append the result to the silver Delta table.

# Output column names, in the positional order of the fields within a message.
FIX_FIELD_NAMES = [
    "BeginString",
    "MsgType",
    "SenderCompID",
    "TargetCompID",
    "MsgSeqNum",
    "SendingTime",
    "ClOrdID",
    "Symbol",
    "Side",
    "OrderQty",
    "Price",
    "CheckSum",
]

# Read from the bronze table, keeping only rows whose text starts with "8=".
raw_df = (spark.readStream
          .table("workspace.default.bronze_fix_messages")
          .filter(col("value").startswith("8=")))

# Split the message once, using the Column API rather than a SQL expression.
# Inside expr(), the SQL parser (default settings) unescapes '\|' to a bare '|',
# a regex that matches the empty string and therefore splits the message between
# every character. Passing the pattern through split() keeps the backslash, so
# the delimiter is a literal pipe.
fix_fields = split(col("value"), r"\|")

# NOTE(review): a message with fewer than 12 fields yields NULLs for the missing
# positions, or an error if ANSI mode is enabled — confirm which is wanted.
parsed_df = raw_df
for position, field_name in enumerate(FIX_FIELD_NAMES):
    parsed_df = parsed_df.withColumn(field_name, fix_fields.getItem(position))

# Write to the silver table.
# NOTE(review): rows already committed under the existing silver checkpoint are
# not re-parsed; reset the checkpoint/table if the old values must be rebuilt.
silver_query = (parsed_df.writeStream
                .format("delta")
                .option("checkpointLocation", "/Volumes/workspace/dev/trading/checkpoint/silver")
                .option("mergeSchema", "true")
                .trigger(availableNow=True)
                .table("workspace.default.silver_fix_messages"))

# Block until the availableNow run finishes so the following preview query
# reads a fully written silver table.
silver_query.awaitTermination()

In [0]:
%sql
-- Preview every row currently stored in the silver table (raw value plus the parsed field columns).
SELECT * FROM workspace.default.silver_fix_messages