In [None]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json
from pyspark.sql.types import StructType, StringType, DoubleType, TimestampType

In [None]:
# Step 1: Build the Spark session with the Kafka source and S3A (hadoop-aws) packages.
#
# Credentials are never written into the notebook. They are read from the
# AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment variables; when either one
# is missing, no key is configured here and S3A is left to resolve credentials itself.
# NOTE(review): that fallback is expected to be S3A's default provider chain
# (e.g. an instance profile) - confirm it matches your deployment.
# The S3 endpoint can be overridden with S3_ENDPOINT (defaults to "s3.amazonaws.com").
session_builder = (
    SparkSession.builder
    .appName("KafkaToS3Parquet")
    .config(
        "spark.jars.packages",
        "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0,"
        "org.apache.hadoop:hadoop-aws:3.3.4",
    )
    .config("spark.hadoop.fs.s3a.endpoint", os.environ.get("S3_ENDPOINT", "s3.amazonaws.com"))
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "true")
    .config("spark.sql.parquet.compression.codec", "snappy")
)

# Only pass static keys when both halves of the key pair are available; the values
# are read inline so the secret is not kept in a notebook-level variable.
if "AWS_ACCESS_KEY_ID" in os.environ and "AWS_SECRET_ACCESS_KEY" in os.environ:
    session_builder = (
        session_builder
        .config("spark.hadoop.fs.s3a.access.key", os.environ["AWS_ACCESS_KEY_ID"])
        .config("spark.hadoop.fs.s3a.secret.key", os.environ["AWS_SECRET_ACCESS_KEY"])
    )

spark = session_builder.getOrCreate()

In [None]:
# Step 2: Open a streaming reader on the "transactions" Kafka topic.
# All source options are passed in one mapping because the broker option name
# contains dots and cannot be written as a Python keyword argument.
kafka_stream = (
    spark.readStream
    .format("kafka")
    .options(
        **{
            "kafka.bootstrap.servers": "localhost:9092",
            "subscribe": "transactions",
            "startingOffsets": "earliest",
            "failOnDataLoss": "false",
        }
    )
    .load()
)

In [None]:
# Step 3: Describe the fields expected in each JSON message read from Kafka.
schema = (
    StructType()
    .add("id", StringType())              # parsed as a string
    .add("amount", DoubleType())          # parsed as a double
    .add("event_time", TimestampType())   # parsed as a timestamp
)

In [None]:
# Step 4: Decode the Kafka message value as a JSON string, parse it against
# `schema`, and flatten the resulting struct into top-level columns.
# NOTE(review): from_json is expected to yield null for values that do not parse,
# which would surface here as rows of nulls - confirm whether those should be filtered.
parsed = (
    kafka_stream
    .selectExpr("CAST(value AS STRING) as json_str")
    .select(from_json("json_str", schema).alias("data"))
    .select("data.*")
)

In [None]:
# Step 5: Start a streaming query that appends the parsed records to S3 as Parquet.
sink = (
    parsed.writeStream
    .outputMode("append")
    .format("parquet")
    .trigger(processingTime="30 seconds")
    .options(
        path="s3a://my-bucket/raw/transactions_parquet/",
        checkpointLocation="s3a://my-bucket/checkpoints/kafka_to_parquet/",
    )
    .start()
)

# Keep the cell running for as long as the streaming query is active.
sink.awaitTermination()
