In [3]:
from pyspark.sql import SparkSession

import pyspark.sql.functions as F

spark = (
    SparkSession.builder.appName("Streaming")  # type: ignore
    .master("spark://spark:7077")
    .config("spark.executor.cores", 1)
    .config("spark.executor.instances", 1)
    # .config("spark.cores.max", 1)
    # Set configuration for Notebook display
    .config("spark.sql.repl.eagerEval.enabled", True)
    # Log level to WARN to avoid huge logs
    .config("spark.logConf", False)
    .getOrCreate()
)
# spark.conf.set("spark.sql.repl.eagerEval.enabled", True)
spark.sparkContext.setLogLevel("WARN")
spark

In [4]:
# Read data from the topic as a DataFrame.
raw_topic_df = (
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers", "kafka:19092")
    .option("subscribe", "OpenSeaRawEvents")
    .load()
)
raw_topic_df.printSchema()
parsed_topic_data = raw_topic_df.selectExpr(
    "CAST(value as string) as json_value",
    "timestamp as processed_at",
)
opensea_events = parsed_topic_data.select(
    F.get_json_object("json_value", "$.payload.event_type.slug").alias("event_type"),
    F.get_json_object("json_value", "$.payload.payload.collection.slug").alias("collection_slug"),
    F.to_timestamp(F.get_json_object("json_value", "$.payload.sent_at")).alias("sent_at"),
    F.get_json_object("json_value", "$.payload.status").alias("status"),
    F.get_json_object("json_value", "$.payload.payload.item.metadata.name").alias("item_name"),
    F.get_json_object("json_value", "$.payload.payload.item.permalink").alias("item_url"),
    F.get_json_object("json_value", "$.payload.payload.item.metadata.image_url").alias("image_url"),
    F.get_json_object("json_value", "$.payload.payload.item.chain.name").alias("item_blockchain"),
    F.get_json_object("json_value", "$.payload.payload.listing_date").alias("listing_date"),
    F.get_json_object("json_value", "$.payload.payload.listing_type").alias("listing_type"),
    F.get_json_object("json_value", "$.payload.payload.from_account.address").alias("from_account"),
    F.get_json_object("json_value", "$.payload.payload.to_account.address").alias("to_account"),
    F.get_json_object("json_value", "$.payload.payload.payment_token.symbol").alias("payment_symbol"),
    F.get_json_object("json_value", "$.payload.payload.payment_token.eth_price").alias("eth_price"),
    F.get_json_object("json_value", "$.payload.payload.payment_token.usd_price").alias("usd_price"),
    F.get_json_object("json_value", "$.payload.payload.quantity").alias("quantity"),
)
opensea_events.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)

root
 |-- event_type: string (nullable = true)
 |-- collection_slug: string (nullable = true)
 |-- sent_at: timestamp (nullable = true)
 |-- status: string (nullable = true)
 |-- item_name: string (nullable = true)
 |-- item_url: string (nullable = true)
 |-- image_url: string (nullable = true)
 |-- item_blockchain: string (nullable = true)
 |-- listing_date: string (nullable = true)
 |-- listing_type: string (nullable = true)
 |-- from_account: string (nullable = true)
 |-- to_account: string (nullable = true)
 |-- payment_symbol: string (nullable = true)
 |-- eth_price: string (nullable = true)
 |-- usd_price: string (nullable = true)
 |-- quantity: string (nullable = true)



In [5]:
sold_items = opensea_events.filter(F.col("event_type") == "item_sold")
sold_items.printSchema()

root
 |-- event_type: string (nullable = true)
 |-- collection_slug: string (nullable = true)
 |-- sent_at: timestamp (nullable = true)
 |-- status: string (nullable = true)
 |-- item_name: string (nullable = true)
 |-- item_url: string (nullable = true)
 |-- image_url: string (nullable = true)
 |-- item_blockchain: string (nullable = true)
 |-- listing_date: string (nullable = true)
 |-- listing_type: string (nullable = true)
 |-- from_account: string (nullable = true)
 |-- to_account: string (nullable = true)
 |-- payment_symbol: string (nullable = true)
 |-- eth_price: string (nullable = true)
 |-- usd_price: string (nullable = true)
 |-- quantity: string (nullable = true)



In [6]:
transferred_items = opensea_events.withColumn(
    "quantity",
    F.when(F.col("quantity").cast("int") > 0, F.col("quantity").cast("int")).otherwise(
        0
    ),
)
transferred_items.printSchema()

root
 |-- event_type: string (nullable = true)
 |-- collection_slug: string (nullable = true)
 |-- sent_at: timestamp (nullable = true)
 |-- status: string (nullable = true)
 |-- item_name: string (nullable = true)
 |-- item_url: string (nullable = true)
 |-- image_url: string (nullable = true)
 |-- item_blockchain: string (nullable = true)
 |-- listing_date: string (nullable = true)
 |-- listing_type: string (nullable = true)
 |-- from_account: string (nullable = true)
 |-- to_account: string (nullable = true)
 |-- payment_symbol: string (nullable = true)
 |-- eth_price: string (nullable = true)
 |-- usd_price: string (nullable = true)
 |-- quantity: integer (nullable = true)



In [7]:
agg_time_frame = "1 minute" # The time for each aggregation
time_frame_txt = "_".join(agg_time_frame.split())
agg_update_time = "30 seconds" # The time for each update of the aggregation
# Every <agg_update_time> we will calculate the aggregation for the last <agg_time_frame>

## Global Metrics

In [8]:
kafka_topic = "OpenSeaEnrichedGlobalEvents"
topic_checkpoint_folder = f"s3a://processed-data/checkpoints/topics/{kafka_topic}"

### Transactions metrics

In [9]:
query_checkpoint_folder = f"{topic_checkpoint_folder}/MarketTransactionsMetrics/"

In [10]:
agg_time_window = F.window("sent_at", agg_time_frame, agg_update_time)

In [11]:
windowed_transactions = (
    transferred_items
    .withWatermark("sent_at", agg_update_time)
    .groupBy(agg_time_window)
    .agg(
        F.count("*").alias("transfers_count"),
        F.sum("quantity").alias("items_transferred_count"),
    )
    .select(
        F.col("window.start").alias("window_start"),
        F.col("window.end").alias("window_end"),
        "transfers_count",
        "items_transferred_count",
    )
)
windowed_transactions.printSchema()

root
 |-- window_start: timestamp (nullable = true)
 |-- window_end: timestamp (nullable = true)
 |-- transfers_count: long (nullable = false)
 |-- items_transferred_count: long (nullable = true)



In [12]:
windowed_transactions_events = windowed_transactions.unpivot(
    ["window_start", "window_end"],
    ["transfers_count", "items_transferred_count"],
    "metric",
    "value",
).select(
    F.concat("metric", F.lit(f"__{time_frame_txt}")).alias("metric"),
    F.col("window_end").alias("timestamp"),
    "value",
    F.lit(None).alias("collection"),
)
windowed_transactions_events.printSchema()

root
 |-- metric: string (nullable = false)
 |-- timestamp: timestamp (nullable = true)
 |-- value: long (nullable = true)
 |-- collection: void (nullable = true)



In [13]:
transactions_metrics_messages = windowed_transactions_events.select(
    F.to_json(F.struct("*"), options={"ignoreNullFields": False}).alias("value")
)

In [14]:
# Send events to Kafka Topic
transactions_metrics_query = (
    transactions_metrics_messages.writeStream.format("kafka")
    .option("kafka.bootstrap.servers", "kafka:19092")
    .option(
        "checkpointLocation",
        query_checkpoint_folder,
    )
    .option("topic", kafka_topic)
)
transactions_metrics_query.start()

24/07/21 09:49:58 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
24/07/21 09:49:59 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


<pyspark.sql.streaming.query.StreamingQuery at 0xffff74142e10>

In [15]:
# transactions_metrics_query_console = (
#     transactions_metrics_messages.writeStream.outputMode("append")
#     .format("console")
#     .option("truncate", False)
#     .start()
# )

### Marketplace Data

In [16]:
query_checkpoint_folder = f"{topic_checkpoint_folder}/MarketDataAggregations/"

In [17]:
agg_time_window = F.window("sent_at", agg_time_frame, agg_update_time)

In [18]:
sold_items_windowed = (
    sold_items.withWatermark("sent_at", agg_update_time)
    .groupBy(agg_time_window)
    .agg(
        F.sum("usd_price").alias("usd_volume"),
        F.count("*").alias("sales_count"),
    )
    .select(
        F.col("window.start").alias("window_start"),
        F.col("window.end").alias("window_end"),
        "usd_volume",
        "sales_count",
    )
)
sold_items_windowed.printSchema()

root
 |-- window_start: timestamp (nullable = true)
 |-- window_end: timestamp (nullable = true)
 |-- usd_volume: double (nullable = true)
 |-- sales_count: long (nullable = false)



In [19]:
sold_items_windowed_events = sold_items_windowed.select(
    F.lit(f"total_volume__{time_frame_txt}").alias("metric"),
    F.col("window_end").alias("timestamp"),
    F.col("usd_volume").alias("value"),
    F.lit(None).alias("collection"),
)
sold_items_windowed_events.printSchema()

root
 |-- metric: string (nullable = false)
 |-- timestamp: timestamp (nullable = true)
 |-- value: double (nullable = true)
 |-- collection: void (nullable = true)



In [20]:
# Send events to Kafka Topic
sales_volume_query = (
    sold_items_windowed_events.select(F.to_json(F.struct("*")).alias("value"))
    .writeStream.format("kafka")
    .option("kafka.bootstrap.servers", "kafka:19092")
    .option(
        "checkpointLocation",
        query_checkpoint_folder,
    )
    .option("topic", kafka_topic)
)
sales_volume_query.start()

24/07/21 09:50:00 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


<pyspark.sql.streaming.query.StreamingQuery at 0xffff7c3f2a10>

## Top Collections

In [21]:
query_checkpoint_folder = f"{topic_checkpoint_folder}/MarketTopCollections/"

In [22]:
agg_time_window = F.window("sent_at", agg_time_frame, agg_update_time)
agg_time_window

Column<'window(sent_at, 60000000, 30000000, 0) AS window'>

In [23]:
sold_items.groupBy("collection_slug", agg_time_window).agg(
    F.sum("usd_price").alias("usd_volume"),
    F.count("*").alias("sales_count"),
).printSchema()

root
 |-- collection_slug: string (nullable = true)
 |-- window: struct (nullable = true)
 |    |-- start: timestamp (nullable = true)
 |    |-- end: timestamp (nullable = true)
 |-- usd_volume: double (nullable = true)
 |-- sales_count: long (nullable = false)



In [24]:

top_collections_windowed = (
    sold_items.withWatermark("sent_at", agg_update_time)
    .groupBy("collection_slug", agg_time_window)
    .agg(
        F.sum("usd_price").alias("usd_volume"),
        F.count("*").alias("sales_count"),
    )
    .select(
        "collection_slug",
        F.col("window.start").alias("window_start"),
        F.col("window.end").alias("window_end"),
        "usd_volume",
        "sales_count",
    )
)
top_collections_windowed.printSchema()

root
 |-- collection_slug: string (nullable = true)
 |-- window_start: timestamp (nullable = true)
 |-- window_end: timestamp (nullable = true)
 |-- usd_volume: double (nullable = true)
 |-- sales_count: long (nullable = false)



In [25]:
top_collections_windowed_events = top_collections_windowed.select(
    F.lit(f"top_collections_by_volume__{time_frame_txt}").alias("metric"),
    F.col("window_end").alias("timestamp"),
    F.col("usd_volume").alias("value"),
    F.col("collection_slug").alias("collection"),
)
top_collections_windowed_events.printSchema()

root
 |-- metric: string (nullable = false)
 |-- timestamp: timestamp (nullable = true)
 |-- value: double (nullable = true)
 |-- collection: string (nullable = true)



In [None]:
# Send events to Kafka Topic
top_collections_query = (
    top_collections_windowed_events.select(F.to_json(F.struct("*")).alias("value"))
    .writeStream.format("kafka")
    .option("kafka.bootstrap.servers", "kafka:19092")
    .option(
        "checkpointLocation",
        query_checkpoint_folder,
    )
    .option("topic", kafka_topic)
)
top_collections_query.start()

24/07/21 09:50:01 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


<pyspark.sql.streaming.query.StreamingQuery at 0xffff740ff150>

24/07/21 09:50:33 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
24/07/21 09:51:10 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
24/07/21 09:51:31 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
24/07/21 10:27:15 WARN NetworkClient: [AdminClient clientId=adminclient-1] Connection to node 1 (kafka/172.18.0.10:19092) could not be established. Broker may not be available.
24/07/21 10:27:15 WARN NetworkClient: [AdminClient clientId=adminclient-2] Connection to node 1 (kafka/172.18.0.10:19092) could not be established. Broker may not be available.
24/07/21 10:27:15 WARN NetworkClient: [AdminClient clie

In [24]:
# For debugging:
top_collections_query_console = (
    top_collections_windowed_events.select(F.to_json(F.struct("*")).alias("value"))
    .writeStream.format("console")
    .option("truncate", False)
)
top_collections_query_console.start()



24/07/20 19:08:09 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-62dfe34c-5490-4f92-acf0-690ed3b14b17. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
24/07/20 19:08:09 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
24/07/20 19:08:10 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.

<pyspark.sql.streaming.query.StreamingQuery at 0xffff94264c90>

In [25]:
# Read data from the topic as a DataFrame.
raw_topic_df = (
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers", "kafka:19092")
    .option("subscribe", "OpenSeaRawEvents")
    .load()
)
raw_topic_df.printSchema()
parsed_df = raw_topic_df.selectExpr(
    "CAST(value as string) as json_value",
    "timestamp as processed_at",
)
nft_data = parsed_df.selectExpr(
    "get_json_object(json_value, '$.payload.payload.collection.slug') as collection_slug",
    "to_timestamp(get_json_object(json_value, '$.payload.sent_at')) as sent_at",
)
agg_df = (
    nft_data.withWatermark(
        "sent_at", "30 seconds"
    )  # Define watermark to handle late data. ie, Data that arrives after 30 seconds of the event time will be ignored
    .groupBy(
        F.window(
            # Every 30 seconds calculate the count of events in the last 1 minute
            "sent_at",
            "1 minute",
            "30 seconds",
        ),
        "collection_slug",
    )
    .count()
)
agg_df = agg_df.select(
    F.col("window.start").alias("window_start"),
    F.col("window.end").alias("window_end"),
    F.lit("")
    "collection_slug",
    "count",
)
agg_df.printSchema()
# query = agg_df.writeStream.format("console").outputMode("complete")
# query.start().awaitTermination()

SyntaxError: invalid syntax. Perhaps you forgot a comma? (465368463.py, line 35)

24/07/20 19:08:10 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------


[Stage 16:>                 (0 + 1) / 1][Stage 18:>                 (0 + 0) / 1]

+-----+
|value|
+-----+
+-----+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+-----+
|value|
+-----+
+-----+



                                                                                

-------------------------------------------
Batch: 2
-------------------------------------------
+-----+
|value|
+-----+
+-----+



                                                                                

-------------------------------------------
Batch: 3
-------------------------------------------
+-----+
|value|
+-----+
+-----+



24/07/20 19:18:27 ERROR WriteToDataSourceV2Exec: Data source write support MicroBatchWrite[epoch: 4, writer: org.apache.spark.sql.kafka010.KafkaStreamingWrite@1574cd1a] is aborting.
24/07/20 19:18:27 ERROR WriteToDataSourceV2Exec: Data source write support MicroBatchWrite[epoch: 4, writer: org.apache.spark.sql.kafka010.KafkaStreamingWrite@1574cd1a] aborted.
24/07/20 19:18:27 ERROR WriteToDataSourceV2Exec: Data source write support MicroBatchWrite[epoch: 4, writer: ConsoleWriter[numRows=20, truncate=false]] is aborting.
24/07/20 19:18:27 ERROR WriteToDataSourceV2Exec: Data source write support MicroBatchWrite[epoch: 55, writer: org.apache.spark.sql.kafka010.KafkaStreamingWrite@2ac72f2] is aborting.
24/07/20 19:18:27 ERROR WriteToDataSourceV2Exec: Data source write support MicroBatchWrite[epoch: 55, writer: org.apache.spark.sql.kafka010.KafkaStreamingWrite@2ac72f2] aborted.
24/07/20 19:18:27 ERROR WriteToDataSourceV2Exec: Data source write support MicroBatchWrite[epoch: 4, writer: org.ap

In [None]:
# Debugging the query
quert = agg_df.writeStream.format("console").outputMode("complete").start()

In [None]:
import os
# Start the Structured Streaming query to Kafka
query = agg_df.select(F.to_json(F.struct("*")).alias("value"))
query.printSchema()
query = (
    query
    .writeStream.format("kafka")
    .option("kafka.bootstrap.servers", "kafka:19092")
    .option(
        "checkpointLocation",
        "s3a://processed-data/checkpoints/topics/OpenSeaEnrichedEvents/",
    )
    .option("topic", "OpenSeaEnrichedEvents")
)
query.start().awaitTermination()