In [1]:
import time
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, when
from pyspark.sql.types import StructType, StructField, StringType

# Schema of the JSON document expected in each Kafka message value.
# All leaves are nullable strings; "InternalParameter" is a nested struct.
internal_parameter_struct = (
    StructType()
    .add("ParameterName", StringType(), True)
    .add("ParameterValue", StringType(), True)
    .add("ParameterUnit", StringType(), True)
)
schema = (
    StructType()
    .add("timestamp", StringType(), True)
    .add("VIN", StringType(), True)
    .add("InternalParameter", internal_parameter_struct, True)
)

# Build the Spark session, or get the existing one if a session is already active.
# The config entry sets the session-wide default checkpoint root for streaming queries.
# NOTE(review): the app name mentions Elasticsearch, but the only sink visible in
# this cell is a CSV file sink - confirm the name is still intended.
spark = (
    SparkSession.builder
    .appName("KafkaToElasticsearch")
    .config(
        "spark.sql.streaming.checkpointLocation",
        "s3://aws-emr-studio-381492251123-eu-central-1/stream_checkpoint/checkpoint/",
    )
    .getOrCreate()
)

# Kafka connection settings: the topic to subscribe to and the broker endpoints.
# The two adjacent literals are concatenated by Python into the single
# comma-separated "host:port,host:port" string passed to the Kafka source.
kafka_topic = "topic1"
kafka_bootstrap_servers = (
    "b-2.kafkacluster8.7p0gh6.c3.kafka.eu-central-1.amazonaws.com:9092,"
    "b-1.kafkacluster8.7p0gh6.c3.kafka.eu-central-1.amazonaws.com:9092"
)

# Streaming source: subscribe to the Kafka topic, starting from the earliest
# available offsets. The reader is configured first, then materialised via load().
kafka_reader = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", kafka_bootstrap_servers)
    .option("subscribe", kafka_topic)
    .option("startingOffsets", "earliest")
)
df = kafka_reader.load()

# Decode -> parse -> flatten, expressed as one pipeline:
#   1. cast the Kafka `value` column to a string,
#   2. parse that string as JSON against `schema` into a struct column "data",
#   3. lift the struct members to top-level columns (nested ones get an
#      "InternalParameter_" prefix).
parsed_payload = from_json(col("value"), schema).alias("data")
flattened_columns = [
    col("data.timestamp"),
    col("data.VIN"),
    col("data.InternalParameter.ParameterName").alias("InternalParameter_ParameterName"),
    col("data.InternalParameter.ParameterValue").alias("InternalParameter_ParameterValue"),
    col("data.InternalParameter.ParameterUnit").alias("InternalParameter_ParameterUnit"),
]
df = (
    df.selectExpr("CAST(value AS STRING)")
    .select(parsed_payload)
    .select(*flattened_columns)
)

# Replace nulls in the three InternalParameter_* columns with empty strings.
# The timestamp and VIN columns are not listed, so their nulls are left as-is.
df = df.na.fill(
    value='',
    subset=[
        'InternalParameter_ParameterName',
        'InternalParameter_ParameterValue',
        'InternalParameter_ParameterUnit',
    ],
)

# Start a streaming query that appends the flattened records as CSV files
# (comma-separated, with a header row) under the S3 output path.
# NOTE(review): this explicit checkpointLocation is the same path as the
# session-wide spark.sql.streaming.checkpointLocation default; it is kept
# unchanged so the query can resume from any existing checkpoint state.
file_query = (
    df.writeStream
    .outputMode("append")
    .format("csv")
    .option("path", "s3://aws-emr-studio-381492251123-eu-central-1/stream_output/")
    .option("checkpointLocation", "s3://aws-emr-studio-381492251123-eu-central-1/stream_checkpoint/checkpoint/")
    .option("header", "true")
    .option("sep", ",")
    .start()
)

# Block until the query terminates. The wait is wrapped in try/finally so that
# if it is interrupted or the query fails, the streaming query is explicitly
# stopped instead of being left running on the cluster.
try:
    file_query.awaitTermination()
finally:
    file_query.stop()


VBox()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
0,application_1723457178906_0001,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkStatementCancellationFailedException: Interrupted by user but Livy failed to cancel the Spark statement. The Livy session might have become unusable.