In [2]:
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session. The Elasticsearch Spark connector is
# requested via `spark.jars.packages`; the "org.elasticsearch.spark.sql"
# sink used at the end of this notebook comes from that package.
spark = (
    SparkSession.builder
    .config(
        "spark.jars.packages",
        "org.elasticsearch:elasticsearch-spark-30_2.12:8.3.3",
    )
    .getOrCreate()
)


In [None]:
# One-off batch read of the Parquet files; the resulting frame is used below
# to display the schema and to supply it to the streaming reader.
schema_df = spark.read.parquet("output/kafka_0.parquet/*.parquet")

In [6]:
# Show the schema of the batch-read frame (rendered as this cell's output).
schema_df.schema

StructType([StructField('key', BinaryType(), True), StructField('value', BinaryType(), True), StructField('topic', StringType(), True), StructField('partition', IntegerType(), True), StructField('offset', LongType(), True), StructField('timestamp', TimestampType(), True), StructField('timestampType', IntegerType(), True)])

In [None]:
# Open the same Parquet files as a streaming source. The schema is given
# explicitly, reusing the one obtained from the batch read (`schema_df`).
kafka_df = (
    spark.readStream
    .schema(schema_df.schema)
    .format("parquet")
    .option("path", "output/kafka_0.parquet/*.parquet")
    .load()
)


In [None]:
# DDL-style schema string describing the JSON payload; it is passed to
# `F.from_json` in a later cell to parse the string-cast `value` column.
# NOTE(review): `location.postcode` is declared INT — if the upstream data can
# contain non-numeric postcodes, parsing may produce nulls; confirm against the
# actual payloads.
json_schema = """
STRUCT<gender: STRING,
name: STRUCT<title: STRING,
            first: STRING,
            last: STRING>,
location: STRUCT<street: STRUCT<number: INT,
                                name: STRING>,
                 city: STRING,
                state: STRING,
                country: STRING,
                postcode: INT,
                coordinates: STRUCT<latitude: STRING,
                                    longitude: STRING>,
                timezone: STRUCT<offset: STRING,
                                description: STRING>
                >,
email: STRING,
login: STRUCT< uuid: STRING,
            username: STRING,
            password: STRING,
            salt: STRING,
            md5: STRING,
            sha1: STRING,
            sha256: STRING>,
dob: STRUCT<date: STRING,
            age: INT>,
registered: STRUCT<date: STRING,
                    age: INT>,
phone: STRING,
cell: STRING,
id: STRUCT<name: STRING,
            value: STRING>,
picture: STRUCT<large: STRING,
                medium: STRING,
                thumbnail: STRING>,
nat: STRING,
timestamp: STRING>
"""

In [None]:
import pyspark.sql.functions as F



# Project the stream down to two columns:
#   json - the `value` column cast to string and parsed with `json_schema`
#   ts   - the record's `timestamp` column, renamed
# Note: this rebinds `kafka_df`, so the cell cannot be re-run on its own; the
# `value` column no longer exists after the select. Re-run the readStream cell
# first.
kafka_df = kafka_df.select(
    F.from_json(F.col("value").cast("string"), json_schema).alias("json"),
    F.col("timestamp").alias("ts"),
)

In [None]:
# Start the streaming write to Elasticsearch (resource "kafka_01" on node
# "elasticsearch") in append mode, checkpointing under "checkpoints/elastic".
# `awaitTermination()` blocks this cell until the query stops.
(
    kafka_df.writeStream
    .format("org.elasticsearch.spark.sql")
    .outputMode("append")
    .option("es.nodes", "elasticsearch")
    .option("es.resource", "kafka_01")
    .option("checkpointLocation", "checkpoints/elastic")
    .start()
    .awaitTermination()
)