In [1]:
from pyspark.sql import SparkSession

spark = (SparkSession.builder.getOrCreate())

In [4]:
import pyspark.sql.functions as F

spark.read.parquet("output/kafka_0.parquet").orderBy(F.desc("timestamp")).show()

#key - partitioning key
#value - the data, in binary format. This is our JSON payload. We'll need to cast it to STRING.
#topic - the topic we are subscribing to
#partition 
#offset - the offset value. This is per topic, partition, and consumer group
#timestamp - the timestamp
#timestampType - whether timestamp is created time or log append time (by default created time)

+-------------------+--------------------+-----+---------+------+--------------------+-------------+
|                key|               value|topic|partition|offset|           timestamp|timestampType|
+-------------------+--------------------+-----+---------+------+--------------------+-------------+
|[66 65 6D 61 6C 65]|[7B 27 67 65 6E 6...|   US|        0|    29|2022-08-02 21:34:...|            0|
|      [6D 61 6C 65]|[7B 27 67 65 6E 6...|   US|        0|    28|2022-08-02 21:34:...|            0|
|      [6D 61 6C 65]|[7B 27 67 65 6E 6...|   US|        0|    30|2022-08-02 21:34:...|            0|
|[66 65 6D 61 6C 65]|[7B 27 67 65 6E 6...|   US|        0|    24|2022-08-02 21:34:...|            0|
|      [6D 61 6C 65]|[7B 27 67 65 6E 6...|   US|        0|    25|2022-08-02 21:34:...|            0|
|[66 65 6D 61 6C 65]|[7B 27 67 65 6E 6...|   US|        0|    26|2022-08-02 21:34:...|            0|
|      [6D 61 6C 65]|[7B 27 67 65 6E 6...|   US|        0|    27|2022-08-02 21:34:...|     

In [87]:
spark.read.parquet("output/kafka_0.parquet/*.parquet").createOrReplaceTempView("vw_kafka_0")

In [88]:
spark.sql("SELECT * FROM vw_kafka_0 ORDER BY offset desc").show()

+-------------------+--------------------+-----+---------+------+--------------------+-------------+
|                key|               value|topic|partition|offset|           timestamp|timestampType|
+-------------------+--------------------+-----+---------+------+--------------------+-------------+
|[66 65 6D 61 6C 65]|[7B 27 67 65 6E 6...|   US|        0|    53|2022-08-02 22:09:...|            0|
|[66 65 6D 61 6C 65]|[7B 27 67 65 6E 6...|   US|        0|    52|2022-08-02 22:09:...|            0|
|[66 65 6D 61 6C 65]|[7B 27 67 65 6E 6...|   US|        0|    51|2022-08-02 22:09:...|            0|
|      [6D 61 6C 65]|[7B 27 67 65 6E 6...|   US|        0|    50|2022-08-02 22:09:...|            0|
|[66 65 6D 61 6C 65]|[7B 27 67 65 6E 6...|   US|        0|    49|2022-08-02 22:09:...|            0|
|[66 65 6D 61 6C 65]|[7B 27 67 65 6E 6...|   US|        0|    48|2022-08-02 22:09:...|            0|
|[66 65 6D 61 6C 65]|[7B 27 67 65 6E 6...|   US|        0|    47|2022-08-02 22:08:...|     

In [9]:
spark.sql("SELECT CAST(key AS STRING) key, CAST(value AS STRING) value, timestamp AS ts FROM vw_kafka_0").show(20, 50)

+------+--------------------------------------------------+-----------------------+
|   key|                                             value|                     ts|
+------+--------------------------------------------------+-----------------------+
|female|{'gender': 'female', 'name': {'title': 'Miss', ...|2022-08-02 21:34:11.319|
|  male|{'gender': 'male', 'name': {'title': 'Mr', 'fir...|2022-08-02 21:34:11.319|
|female|{'gender': 'female', 'name': {'title': 'Ms', 'f...|2022-08-02 21:34:11.319|
|  male|{'gender': 'male', 'name': {'title': 'Mr', 'fir...|2022-08-02 21:34:11.319|
|  male|{'gender': 'male', 'name': {'title': 'Mr', 'fir...| 2022-08-02 21:34:11.32|
|female|{'gender': 'female', 'name': {'title': 'Miss', ...| 2022-08-02 21:34:11.32|
|  male|{'gender': 'male', 'name': {'title': 'Mr', 'fir...| 2022-08-02 21:34:11.32|
|female|{'gender': 'female', 'name': {'title': 'Miss', ...|2022-08-02 21:06:29.574|
|female|{'gender': 'female', 'name': {'title': 'Miss', ...|2022-08-02 21:06:

In [50]:
json_schema = """
STRUCT<gender: STRING,
name: STRUCT<title: STRING,
            first: STRING,
            last: STRING>,
location: STRUCT<street: STRUCT<number: INT,
                                name: STRING>,
                 city: STRING,
                state: STRING,
                country: STRING,
                postcode: INT,
                coordinates: STRUCT<latitude: STRING,
                                    longitude: STRING>,
                timezone: STRUCT<offset: STRING,
                                description: STRING>
                >,
email: STRING,
login: STRUCT< uuid: STRING,
            username: STRING,
            password: STRING,
            salt: STRING,
            md5: STRING,
            sha1: STRING,
            sha256: STRING>,
dob: STRUCT<date: STRING,
            age: INT>,
registered: STRUCT<date: STRING,
                    age: INT>,
phone: STRING,
cell: STRING,
id: STRUCT<name: STRING,
            value: STRING>,
picture: STRUCT<large: STRING,
                medium: STRING,
                thumbnail: STRING>,
nat: STRING,
timestamp: STRING>
"""

In [92]:
spark.sql(f"""
SELECT ts, json.* FROM (
SELECT FROM_JSON(CAST(value AS STRING),'{json_schema}') AS json,
timestamp as ts
FROM vw_kafka_0 
WHERE offset > 30
)
""").show()

+--------------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------+--------------+------------------+--------------------+---+---------+
|                  ts|gender|                name|            location|               email|               login|                 dob|          registered|         phone|          cell|                id|             picture|nat|timestamp|
+--------------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------+--------------+------------------+--------------------+---+---------+
|2022-08-02 22:06:...|  male|{Mr, Pedro, Stanley}|{{7978, Stevens C...|pedro.stanley@exa...|{188e6e47-d708-4b...|{1979-02-06T07:07...|{2019-03-14T18:40...|(245) 830-6507|(921) 267-8950|{SSN, 388-20-3483}|{https://randomus...| US|     null|
|2022-08-02 22:06:...|female|{Ms, Abigai