In [20]:
import time
import psycopg2
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, date_format, broadcast
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from elasticsearch import Elasticsearch, exceptions

# Build (or reuse) the Spark session. The checkpoint location is configured at
# session level, so it acts as the default for streaming queries started below.
spark = (
    SparkSession.builder
    .appName("KafkaToElasticsearch")
    .config(
        "spark.sql.streaming.checkpointLocation",
        "s3://aws-emr-studio-381492251123-eu-central-1/stream_checkpoint/checkpoint/",
    )
    .getOrCreate()
)

# Connection settings for the PostgreSQL lookup database.
# NOTE(review): the credentials are hard-coded in the notebook — consider
# loading them from a secret store or environment instead.
jdbc_url = "jdbc:postgresql://10.0.3.216:5432/test_db"
jdbc_properties = dict(
    user="postgres",
    password="postgres",
    driver="org.postgresql.Driver",
)

# Load the "cars" table as a static (non-streaming) DataFrame and rename its
# "VIN Number" column to "VIN" so it lines up with the Kafka stream's column.
car_model_df = spark.read.jdbc(
    url=jdbc_url,
    table="cars",
    properties=jdbc_properties,
).withColumnRenamed("VIN Number", "VIN")

# Where the Elasticsearch node lives and which index holds the raw stream.
es_host = "10.0.3.216"
es_port = 9200  # kept as an int; converted to str only where Spark needs it
es_scheme = "http"
raw_es_index = "raw_stream_data"

# Mapping for the un-joined stream documents.
# "timestamp" is deliberately mapped as text (a plain string), not as a date.
raw_mappings = {
    "properties": {
        "timestamp": {"type": "text"},
        "VIN": {"type": "keyword"},
        "ParameterName": {"type": "keyword"},
        "ParameterValue": {"type": "integer"},
        "ParameterUnit": {"type": "keyword"},
    }
}

# Mapping for the joined documents: same fields as above plus "Car Model",
# which comes from the PostgreSQL lookup table.
joined_raw_mappings = {
    "properties": {
        "timestamp": {"type": "text"},
        "VIN": {"type": "keyword"},
        "Car Model": {"type": "keyword"},
        "ParameterName": {"type": "keyword"},
        "ParameterValue": {"type": "integer"},
        "ParameterUnit": {"type": "keyword"},
    }
}

# Python-side Elasticsearch client, built from the host/port/scheme settings above
es = Elasticsearch(hosts=[dict(host=es_host, port=es_port, scheme=es_scheme)])

def create_index_if_not_exists(es_client, index_name, mappings):
    """Create ``index_name`` with ``mappings`` unless it is already present.

    Args:
        es_client: Client exposing ``indices.exists`` and ``indices.create``.
        index_name: Name of the index to check for and, if missing, create.
        mappings: Mapping definition placed under the ``"mappings"`` key of
            the create request body.

    Returns:
        None. Progress and failures are reported with ``print`` only; no
        exception raised inside the ``try`` block propagates to the caller.
    """
    try:
        already_present = es_client.indices.exists(index=index_name)
        if already_present:
            print(f"Index '{index_name}' already exists.")
            return

        print(f"Index '{index_name}' does not exist. Creating index...")
        # The caller-supplied mappings are sent as-is
        create_body = {"mappings": mappings}
        es_client.indices.create(index=index_name, body=create_body)
        print(f"Index '{index_name}' created successfully.")
    except exceptions.RequestError as err:
        print(f"RequestError: {err.info}")
    except exceptions.ConnectionError as err:
        print(f"ConnectionError: {err}")
    except Exception as err:
        print(f"Error creating index: {err}")

# Make sure the target index exists (with the joined mapping) before streaming
create_index_if_not_exists(
    es_client=es,
    index_name="joined_raw_index",
    mappings=joined_raw_mappings,
)

# Schema of the JSON payload in each Kafka message value.
# "timestamp" is read as a plain string; "InternalParameter" is a nested struct.
schema = (
    StructType()
    .add("timestamp", StringType(), True)
    .add("VIN", StringType(), True)
    .add(
        "InternalParameter",
        StructType()
        .add("ParameterName", StringType(), True)
        .add("ParameterValue", IntegerType(), True)
        .add("ParameterUnit", StringType(), True),
        True,
    )
)

# Kafka source settings
kafka_bootstrap_servers = (
    "b-1.kafkajoin.e1po4n.c3.kafka.eu-central-1.amazonaws.com:9092,"
    "b-2.kafkajoin.e1po4n.c3.kafka.eu-central-1.amazonaws.com:9092"
)
kafka_topic = "topic1"

# Streaming pipeline: subscribe to the topic from the earliest offsets, decode
# the message value as a string, parse it as JSON with the schema above, then
# flatten the nested struct into top-level columns.
# NOTE(review): the format pattern appends a literal 'Z'; it does not itself
# convert the value to UTC — confirm the source timestamps are already UTC.
kafka_df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", kafka_bootstrap_servers)
    .option("subscribe", kafka_topic)
    .option("startingOffsets", "earliest")
    .load()
    .selectExpr("CAST(value AS STRING)")
    .select(from_json(col("value"), schema).alias("data"))
    .select(
        date_format(col("data.timestamp"), "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'").alias("timestamp"),
        col("data.VIN"),
        col("data.InternalParameter.ParameterName"),
        col("data.InternalParameter.ParameterValue"),
        col("data.InternalParameter.ParameterUnit"),
    )
)

# Mark the static lookup table for broadcast so the join can avoid a shuffle
broadcast_car_model_df = broadcast(car_model_df)

# Left-outer join on VIN: every stream row is kept, and "Car Model" is taken
# from the lookup table (null when the VIN has no match there).
joined_df = kafka_df.join(
    broadcast_car_model_df,
    on=kafka_df["VIN"] == broadcast_car_model_df["VIN"],
    how="left_outer",
).select(
    kafka_df["timestamp"],
    kafka_df["VIN"],
    broadcast_car_model_df["Car Model"].alias("Car Model"),
    kafka_df["ParameterName"],
    kafka_df["ParameterValue"],
    kafka_df["ParameterUnit"],
)

# Connector options for the Spark -> Elasticsearch sink.
# NOTE(review): "es.index.auto.create" is "true", which lets the connector
# create the index when it is missing. If the index created above with explicit
# mappings is meant to be authoritative, consider setting this to "false".
es_write_conf = {
    "es.nodes": es_host,
    "es.port": f"{es_port}",  # the connector options are passed as strings
    "es.index.auto.create": "true",
}

# Start the streaming write of the joined rows into "joined_raw_index"
query = (
    joined_df.writeStream
    .format("org.elasticsearch.spark.sql")
    .options(**es_write_conf)
    .option("es.resource", "joined_raw_index")
    .outputMode("append")
    .start()
)

# Block this cell until the streaming query stops or fails
query.awaitTermination()


VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkStatementCancellationFailedException: Interrupted by user but Livy failed to cancel the Spark statement. The Livy session might have become unusable.

In [19]:
# Drop the joined index, but only when it is actually there
index_to_drop = "joined_raw_index"
if es.indices.exists(index=index_to_drop):
    es.indices.delete(index=index_to_drop)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

ObjectApiResponse({'acknowledged': True})