In [0]:
from pyspark.sql.functions import col, split, explode, expr, monotonically_increasing_id

In [0]:
# Bronze ingest: Auto Loader ("cloudFiles") reads the landing directory as plain
# text and writes it to the bronze Delta table. trigger(availableNow=True) makes
# this a run-to-completion job rather than a continuously running stream.
# NOTE(review): the checkpoint directory below sits underneath the load path —
# confirm Auto Loader is not also ingesting checkpoint files as input rows.
(
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "text")
    .load("/Volumes/workspace/dev/trading/")
    .writeStream
    .format("delta")
    .trigger(availableNow=True)
    .option("checkpointLocation", "/Volumes/workspace/dev/trading/checkpoint/bronze")
    .table("workspace.default.bronze_fix_messages")
)

In [0]:
%sql
-- Preview the contents of the bronze table.
SELECT
  *
FROM
  workspace.default.bronze_fix_messages

In [0]:
# Stream the bronze table and keep only the rows whose text starts with "8=".
raw_df = (
    spark.readStream
    .table("workspace.default.bronze_fix_messages")
    .where(col("value").startswith("8="))
)

# FIX tags to extract: each key is the output column name, each value is the
# tag number (kept as a string) to look for in the message.
tags = dict(
    BeginString="8",
    MsgType="35",
    SenderCompID="49",
    TargetCompID="56",
    MsgSeqNum="34",
    SendingTime="52",
    ClOrdID="11",
    Symbol="55",
    Side="54",
    OrderQty="38",
    Price="44",
    CheckSum="10",
)

# Split each raw message on the "|" delimiter into an array of "tag=value" fields.
kv_df = raw_df.withColumn("fields", split(col("value"), "\\|"))

# Spark SQL template that pulls the value of one FIX tag out of `fields`:
#   1. keep the fields that start with "<tag>=" and take the first one;
#   2. split it on the first "=" only (limit 2), so a value that itself
#      contains "=" is returned whole instead of being cut at its first "=";
#   3. take the second piece, i.e. the value.
# try_element_at returns NULL when the requested element does not exist, so a
# message that lacks a tag yields NULL for that column. The plain [0] index
# operator raises an out-of-bounds error in that case when ANSI mode is
# enabled, which would fail the whole stream on the first such message.
TAG_VALUE_SQL = (
    "try_element_at("
    "split(try_element_at(filter(fields, x -> x like '{tag}=%'), 1), '=', 2), "
    "2)"
)

# Add every tag column in one withColumns call: a single projection instead of
# one per withColumn call in a loop, with the same "replace if the name already
# exists" semantics as withColumn.
kv_df = kv_df.withColumns({
    col_name: expr(TAG_VALUE_SQL.format(tag=tag))
    for col_name, tag in tags.items()
})

# Keep the original message text next to the extracted tag columns.
parsed_df = kv_df.select("value", *tags.keys())

# Configure the silver sink: append-mode Delta writes with schema merging
# enabled and a dedicated checkpoint directory for this stream.
silver_writer = (
    parsed_df.writeStream
    .outputMode("append")
    .format("delta")
    .option("mergeSchema", "true")
    .option("checkpointLocation", "/Volumes/workspace/dev/trading/checkpoint/silver")
    .trigger(availableNow=True)
)

# Start the stream into the silver table.
silver_writer.table("workspace.default.silver_fix_messages")

In [0]:
%sql
-- Preview the contents of the silver table.
SELECT
  *
FROM
  workspace.default.silver_fix_messages