In [1]:
from pyspark.sql import SparkSession

# Reuse the SparkSession that is already running, or create one with the
# builder's default settings if none exists yet.
spark = SparkSession.builder.getOrCreate()

In [6]:
import pyspark.sql.functions as F

# Peek at the raw Kafka records that were dumped to Parquet, newest first.
# Only the *.parquet part files are read, which is the same file set the
# temp-view cell further down registers, so both cells look at identical data.
(
    spark.read
    .parquet("output/kafka_0x.parquet/*.parquet")
    .orderBy(F.desc("timestamp"))
    .show(n=20, truncate=40)  # 20 rows, cell text cut at 40 characters
)

# Columns of each record (the layout Spark's Kafka source produces):
# key           - the record key, in binary format (Kafka's default partitioner hashes it to pick the partition)
# value         - the data, in binary format. This is our JSON payload. We'll need to cast it to STRING.
# topic         - the topic the record was read from
# partition     - the partition of that topic the record is stored in
# offset        - the record's position within its topic partition (offsets are not shared across partitions)
# timestamp     - the record timestamp
# timestampType - 0 = create time (set by the producer, the default), 1 = log append time (set by the broker)

+-------------------+----------------------------------------+-----+---------+------+-----------------------+-------------+
|                key|                                   value|topic|partition|offset|              timestamp|timestampType|
+-------------------+----------------------------------------+-----+---------+------+-----------------------+-------------+
|      [6D 61 6C 65]|[7B 27 67 65 6E 64 65 72 27 3A 20 27 ...|   US|        0|    59|2022-08-03 08:31:02.377|            0|
|      [6D 61 6C 65]|[7B 27 67 65 6E 64 65 72 27 3A 20 27 ...|   US|        0|    61|2022-08-03 08:31:02.377|            0|
|[66 65 6D 61 6C 65]|[7B 27 67 65 6E 64 65 72 27 3A 20 27 ...|   US|        0|    65|2022-08-03 08:31:02.377|            0|
|[66 65 6D 61 6C 65]|[7B 27 67 65 6E 64 65 72 27 3A 20 27 ...|   US|        0|    62|2022-08-03 08:31:02.377|            0|
|      [6D 61 6C 65]|[7B 27 67 65 6E 64 65 72 27 3A 20 27 ...|   US|        0|    60|2022-08-03 08:31:02.377|            0|
|[66 65 

In [9]:
# Register the Parquet dump as the temp view `vw_kafka_0` so the following
# cells can query it with plain SQL.
(
    spark.read
    .parquet("output/kafka_0x.parquet/*.parquet")
    .createOrReplaceTempView("vw_kafka_0")
)

In [10]:
# Same records through SQL, highest offset first. show() with no arguments
# prints at most 20 rows and truncates cell text to 20 characters.
(
    spark
    .sql("SELECT * FROM vw_kafka_0 ORDER BY offset desc")
    .show()
)

+-------------------+--------------------+-----+---------+------+--------------------+-------------+
|                key|               value|topic|partition|offset|           timestamp|timestampType|
+-------------------+--------------------+-----+---------+------+--------------------+-------------+
|[66 65 6D 61 6C 65]|[7B 27 67 65 6E 6...|   US|        0|    65|2022-08-03 08:31:...|            0|
|[66 65 6D 61 6C 65]|[7B 27 67 65 6E 6...|   US|        0|    64|2022-08-03 08:31:...|            0|
|[66 65 6D 61 6C 65]|[7B 27 67 65 6E 6...|   US|        0|    63|2022-08-03 08:31:...|            0|
|[66 65 6D 61 6C 65]|[7B 27 67 65 6E 6...|   US|        0|    62|2022-08-03 08:31:...|            0|
|      [6D 61 6C 65]|[7B 27 67 65 6E 6...|   US|        0|    61|2022-08-03 08:31:...|            0|
|      [6D 61 6C 65]|[7B 27 67 65 6E 6...|   US|        0|    60|2022-08-03 08:31:...|            0|
|      [6D 61 6C 65]|[7B 27 67 65 6E 6...|   US|        0|    59|2022-08-03 08:31:...|     

In [11]:
# key and value are stored as binary; casting them to STRING makes the
# payload human-readable. The Kafka timestamp is carried along as `ts`.
(
    spark
    .sql("SELECT CAST(key AS STRING) key, CAST(value AS STRING) value, timestamp AS ts FROM vw_kafka_0")
    .show(n=20, truncate=50)
)

+------+--------------------------------------------------+-----------------------+
|   key|                                             value|                     ts|
+------+--------------------------------------------------+-----------------------+
|  male|{'gender': 'male', 'name': {'title': 'Mr', 'fir...|2022-08-03 08:31:02.377|
|  male|{'gender': 'male', 'name': {'title': 'Mr', 'fir...|2022-08-03 08:31:02.377|
|  male|{'gender': 'male', 'name': {'title': 'Mr', 'fir...|2022-08-03 08:31:02.377|
|female|{'gender': 'female', 'name': {'title': 'Miss', ...|2022-08-03 08:31:02.377|
|female|{'gender': 'female', 'name': {'title': 'Ms', 'f...|2022-08-03 08:31:02.377|
|female|{'gender': 'female', 'name': {'title': 'Mrs', '...|2022-08-03 08:31:02.377|
|female|{'gender': 'female', 'name': {'title': 'Mrs', '...|2022-08-03 08:31:02.377|
|  male|{'gender': 'male', 'name': {'title': 'Mr', 'fir...|2022-08-03 08:08:49.218|
|female|{'gender': 'female', 'name': {'title': 'Miss', ...|2022-08-03 08:08:

In [12]:
# Spark SQL DDL schema (a single STRUCT<...> type string) with nested structs
# for name, location, login, dob, registered, id and picture.
# Presumably it mirrors the complete JSON document carried in the Kafka
# `value` column -- TODO confirm against a full payload.
# NOTE(review): a later cell assigns json_schema again with a much smaller
# schema, and that later value is what the FROM_JSON query interpolates. On a
# top-to-bottom run this full version is therefore overwritten before any
# visible use; confirm whether it is still needed.
json_schema = """
STRUCT<gender: STRING,
name: STRUCT<title: STRING,
            first: STRING,
            last: STRING>,
location: STRUCT<street: STRUCT<number: INT,
                                name: STRING>,
                 city: STRING,
                state: STRING,
                country: STRING,
                postcode: INT,
                coordinates: STRUCT<latitude: STRING,
                                    longitude: STRING>,
                timezone: STRUCT<offset: STRING,
                                description: STRING>
                >,
email: STRING,
login: STRUCT< uuid: STRING,
            username: STRING,
            password: STRING,
            salt: STRING,
            md5: STRING,
            sha1: STRING,
            sha256: STRING>,
dob: STRUCT<date: STRING,
            age: INT>,
registered: STRUCT<date: STRING,
                    age: INT>,
phone: STRING,
cell: STRING,
id: STRUCT<name: STRING,
            value: STRING>,
picture: STRUCT<large: STRING,
                medium: STRING,
                thumbnail: STRING>,
nat: STRING,
timestamp: STRING>
"""

In [19]:
# Reduced Spark SQL DDL schema: only a handful of top-level fields are
# extracted from the payload.
# This assignment replaces the larger json_schema defined in an earlier cell;
# the FROM_JSON query that follows interpolates whatever json_schema holds
# when it runs.
# NOTE(review): in the output recorded under that query, `summerschool` and
# `timestamp` are null in every visible row -- presumably those records do
# not carry these fields; confirm.
json_schema = """
STRUCT<gender: STRING,
email: STRING,
phone: STRING,
cell: STRING,
nat: STRING,
summerschool: INT,
timestamp: STRING>
"""

In [20]:
# Parse the Kafka payload with FROM_JSON, using the DDL schema currently held
# in json_schema, and expand the resulting struct into one column per field.
# The subquery projects only the parsed struct because the outer SELECT keeps
# nothing but json.*; add the Kafka `timestamp` to both SELECT lists if it is
# needed in the output.
# NOTE(review): `offset > 10` skips the earliest records; the reason for this
# cut-off is not visible here -- confirm.
spark.sql(f"""
SELECT json.* FROM (
SELECT FROM_JSON(CAST(value AS STRING),'{json_schema}') AS json
FROM vw_kafka_0
WHERE offset > 10
)
""").show()

+------+--------------------+--------------+--------------+---+------------+---------+
|gender|               email|         phone|          cell|nat|summerschool|timestamp|
+------+--------------------+--------------+--------------+---+------------+---------+
|  male|milton.johnston@e...|(774) 935-9397|(877) 594-2889| US|        null|     null|
|  male|roy.griffin@examp...|(489) 383-3270|(735) 862-4436| US|        null|     null|
|  male|guy.watkins@examp...|(237) 900-8933|(597) 376-5534| US|        null|     null|
|female|tamara.soto@examp...|(900) 850-6151|(230) 928-3213| US|        null|     null|
|female|addison.chapman@e...|(200) 497-2756|(552) 511-2887| US|        null|     null|
|female|sandra.morrison@e...|(206) 381-3465|(934) 992-5383| US|        null|     null|
|female|celina.caldwell@e...|(933) 825-3027|(630) 592-2136| US|        null|     null|
|  male|tristan.green@exa...|(609) 790-7729|(990) 622-7230| US|        null|     null|
|female|victoria.west@exa...|(966) 389-5375