In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json
from pyspark.sql.types import *

In [2]:
# 1. Obtain the SparkSession (created on first run, reused afterwards)
#    that every later cell uses, pointed at the standalone master URL below.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("KafkaToDelta")
    .getOrCreate()
)

In [3]:
# 2. Schema of the record carried in the message "payload".
#    Built field by field; fields are nullable unless stated otherwise
#    (only ID and pk_id are declared non-nullable).
payload_schema = (
    StructType()
    .add("ID", StringType(), nullable=False)
    .add("SQN", IntegerType())
    .add("APPROVAL_DATETIME", StringType())
    .add("TXN_ID", StringType())
    .add("RESPONSE_CODE", IntegerType())
    .add("STATUS", IntegerType())
    .add("TXN_TYPE", IntegerType())
    .add("TXN_SUB_TYPE", IntegerType())
    .add("PROCESSING_CODE", IntegerType())
    .add("TXN_TYPE_D_C", IntegerType())
    .add("TXN_CAT", IntegerType())
    .add("CHANNEL", IntegerType())
    .add("FROM_ID", IntegerType())
    .add("TARGET_ID", IntegerType())
    .add("TXN_AMT", DoubleType())
    .add("FROM_TRUST_LEVEL", IntegerType())
    .add("TARGET_TRUST_LEVEL", IntegerType())
    .add("FROM_ACCOUNT_TYPE", IntegerType())
    .add("TARGET_ACCOUNT_TYPE", IntegerType())
    .add("pk_id", IntegerType(), nullable=False)
    .add("__deleted", StringType())
)

In [4]:
# 3. Schema of the whole Debezium message: a "schema" member next to the
#    "payload" member that holds the row itself.
envelope_schema = (
    StructType()
    .add("schema", StructType(), nullable=True)  # declared as a struct with no fields
    .add("payload", payload_schema)              # actual data
)

In [5]:
# 4. Open the Kafka topic as a streaming DataFrame, starting from the
#    earliest available offsets.
raw_df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "broker:29092")
    .option("startingOffsets", "earliest")
    .option("subscribe", "postgres_cdc_.public.kafka_sink_data")
    .load()
)

In [6]:
# 5. Cast the Kafka record's 'value' column to a string column named
#    json_str; this is the only column kept from the raw stream.
json_df = raw_df.selectExpr(
    "CAST(value AS STRING) as json_str",
)

In [7]:
# 6. Parse json_str against the envelope schema into a struct column "data",
#    then flatten data.payload so each payload field becomes a top-level column.
# NOTE(review): messages that do not parse against envelope_schema presumably
# come through as rows of nulls — confirm whether they should be filtered out.
payload_df = (
    json_df
    .withColumn("data", from_json(col("json_str"), envelope_schema))
    .select("data.payload.*")
)

In [8]:
# 7. Start the streaming write of the flattened payload into the bronze
#    Delta path, in append mode, with its own checkpoint directory.
#    The returned handle is kept in `query`.
query = (
    payload_df.writeStream
    .outputMode("append")
    .format("delta")
    .option("checkpointLocation", "/delta/checkpoints/kafka_sink_data")
    .start("/delta/bronze/kafka_sink_data")
)

In [9]:
from delta.tables import DeltaTable
# Read the bronze Delta table.
#
# The streaming write is started asynchronously: start() hands back the
# `query` handle without waiting for any micro-batch to be committed. On a
# fresh "Restart & Run All" a batch read issued right away can therefore hit
# a path with no Delta log yet, or see only part of the backlog.
# processAllAvailable() blocks until everything currently available in the
# source has been processed and committed to the sink, and re-raises the
# stream's exception if the query has failed.
# NOTE(review): if the topic receives data continuously this call may block
# for a long time — acceptable for an interactive check, confirm for other use.
query.processAllAvailable()

bronze_df = spark.read.format("delta").load("/delta/bronze/kafka_sink_data")

In [10]:
# Print up to 20 rows of the bronze table, with column values shown in full
# rather than truncated.
bronze_df.show(n=20, truncate=False)

+----------------+---+----------------------------+--------+-------------+------+--------+------------+---------------+------------+-------+-------+--------+---------+-------+----------------+------------------+-----------------+-------------------+-------+---------+
|ID              |SQN|APPROVAL_DATETIME           |TXN_ID  |RESPONSE_CODE|STATUS|TXN_TYPE|TXN_SUB_TYPE|PROCESSING_CODE|TXN_TYPE_D_C|TXN_CAT|CHANNEL|FROM_ID |TARGET_ID|TXN_AMT|FROM_TRUST_LEVEL|TARGET_TRUST_LEVEL|FROM_ACCOUNT_TYPE|TARGET_ACCOUNT_TYPE|pk_id  |__deleted|
+----------------+---+----------------------------+--------+-------------+------+--------+------------+---------------+------------+-------+-------+--------+---------+-------+----------------+------------------+-----------------+-------------------+-------+---------+
|2505015300000001|1  |01-MAY-25 12.23.52.564571 AM|E6F889FC|0            |1     |53      |null        |530085         |1           |91     |81     |13285611|null     |16.25  |4               |null