In [1]:
# Enable horizontal scrolling:
from IPython.display import display, HTML
# Inject a stylesheet that forces <pre> blocks to keep their whitespace
# (no wrapping), so wide text output scrolls sideways instead of folding.
no_wrap_css = "<style>pre { white-space: pre !important; }</style>"
display(HTML(no_wrap_css))

In [2]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

In [25]:
# Connect to Spark on port 7077, set log level to WARN.
#
# The session is obtained through the builder's getOrCreate() rather than by
# constructing SparkContext directly: only one SparkContext may be active per
# kernel, so constructing it again on a cell re-run raises
# "Cannot run multiple SparkContexts at once". getOrCreate() reuses the live
# session when there is one, which makes this cell safe to execute repeatedly.
spark = (
    SparkSession.builder
    .master("spark://spark:7077")
    .appName("batch")
    .getOrCreate()
)

# Keep the `spark_context` name available for any cell that relies on it.
spark_context = spark.sparkContext
spark_context.setLogLevel("WARN")

# Render DataFrames as tables when they are the last expression of a cell.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)
spark

In [62]:
# Read every gzipped JSON file found under the year=/month=/day=/hour=
# partition directories of the raw OpenSea events topic.
raw_events_glob = "s3a://raw-data/topics/OpenSeaRawEvents/year=*/month=*/day=*/hour=*/*.json.gz"
df = spark.read.json(raw_events_glob)

# Report how many raw records were loaded, then display the frame itself.
total_rows = df.count()
print("Count: ", total_rows)
df

                                                                                

Count:  5475


event,payload,ref,topic
item_transferred,{item_transferred...,,collection:*
item_metadata_upd...,{item_metadata_up...,,collection:*
item_metadata_upd...,{item_metadata_up...,,collection:*
item_transferred,{item_transferred...,,collection:*
item_transferred,{item_transferred...,,collection:*
item_transferred,{item_transferred...,,collection:*
item_transferred,{item_transferred...,,collection:*
item_transferred,{item_transferred...,,collection:*
item_transferred,{item_transferred...,,collection:*
item_metadata_upd...,{item_metadata_up...,,collection:*


In [63]:
# Summary statistics (count / mean / stddev / min / max) for the raw frame.
raw_summary = df.describe()
raw_summary.show()

# Full inferred schema, including the nested `payload` struct.
df.printSchema()



+-------+-----------+---+------------+
|summary|      event|ref|       topic|
+-------+-----------+---+------------+
|  count|       5475| 91|        5475|
|   mean|       NULL|0.0|        NULL|
| stddev|       NULL|0.0|        NULL|
|    min|item_listed|  0|collection:*|
|    max|  phx_reply|  0|     phoenix|
+-------+-----------+---+------------+

root
 |-- event: string (nullable = true)
 |-- payload: struct (nullable = true)
 |    |-- event_type: string (nullable = true)
 |    |-- payload: struct (nullable = true)
 |    |    |-- base_price: string (nullable = true)
 |    |    |-- chain: string (nullable = true)
 |    |    |-- collection: struct (nullable = true)
 |    |    |    |-- slug: string (nullable = true)
 |    |    |-- event_timestamp: string (nullable = true)
 |    |    |-- expiration_date: string (nullable = true)
 |    |    |-- from_account: struct (nullable = true)
 |    |    |    |-- address: string (nullable = true)
 |    |    |-- is_private: boolean (nullable = true)

                                                                                

In [65]:
from pyspark.sql import functions as F

# Flatten the nested event payload into one column per field of interest.
# Each entry is (output column name, dotted path into the raw record);
# the order here is the column order of the resulting frame.
event_field_paths = [
    ("event_type", "payload.event_type"),
    ("collection_slug", "payload.payload.collection.slug"),
    ("sent_at", "payload.sent_at"),
    ("status", "payload.status"),
    ("item_name", "payload.payload.item.metadata.name"),
    ("image_url", "payload.payload.item.metadata.image_url"),
    ("item_blockchain", "payload.payload.item.chain.name"),
    ("listing_date", "payload.payload.listing_date"),
    ("listing_type", "payload.payload.listing_type"),
    ("from_account", "payload.payload.from_account.address"),
    ("to_account", "payload.payload.to_account.address"),
    ("payment_symbol", "payload.payload.payment_token.symbol"),
    ("eth_price", "payload.payload.payment_token.eth_price"),
    ("usd_price", "payload.payload.payment_token.usd_price"),
    ("quantity", "payload.payload.quantity"),
]

# Fields that are parsed into a timestamp instead of being copied as-is.
timestamp_fields = {"sent_at"}

flattened_columns = [
    (F.to_timestamp(path) if name in timestamp_fields else F.col(path)).alias(name)
    for name, path in event_field_paths
]

# Keep the top-level `event` column first, then drop rows whose event is
# "phx_reply" so only the remaining event kinds are analysed.
df_events = df.select("event", *flattened_columns).filter(F.col("event") != "phx_reply")
df_events.show(truncate=False)

+---------------------+---------------------+-------------------------------+--------------------------+------+------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+----------------+--------------------------------+------------+------------------------------------------+------------------------------------------+--------------+-----------------+-----------------------+--------+
|event                |event_type           |collection_slug                |sent_at                   |status|item_name                                                                     |image_url                                                                            |item_blockchain |listing_date                    |listing_type|from_account                              |to_account                                |payment_symbol|eth_price        |usd_price              |quantity|
+---------------

In [64]:
# Latest and earliest `sent_at` values, i.e. the time window the events span.
sent_at_bounds = df_events.select(F.max("sent_at"), F.min("sent_at"))
sent_at_bounds.show(truncate=False)



+-------------------------+--------------------------+
|max(sent_at)             |min(sent_at)              |
+-------------------------+--------------------------+
|2024-06-09 18:25:29.33635|2024-06-09 17:32:43.847255|
+-------------------------+--------------------------+



                                                                                

In [66]:
# Number of events per collection, largest collections first.
# NOTE(review): despite its name, `event_type_agg_df` is grouped by
# `collection_slug`, not by event type; the name is kept unchanged here
# because other cells may reference it.
events_per_collection = df_events.groupBy("collection_slug").agg(
    F.count("*").alias("count")
)
event_type_agg_df = events_per_collection.orderBy(F.desc("count"))
event_type_agg_df.show()



+--------------------+-----+
|     collection_slug|count|
+--------------------+-----+
|          datacube-2| 2060|
|        game-item-11| 1388|
|             babes-8|  592|
|        game-item-10|  313|
|    juicyposition-13|  134|
|          datacube-1|  129|
|the-bear-and-the-...|   50|
|           scores-50|   45|
|the-bear-and-the-...|   35|
|         degencard-5|   30|
|       space-owner-6|   28|
|      shinkai-nft-15|   22|
|    bored-ape-club-3|   22|
|testing-open-tick...|   20|
|earn-m-piggybox-n...|   20|
|clober-orderbook-...|   15|
|  error-triage-pin-1|   13|
|control-structure...|   13|
|       imports-pin-1|   13|
|my-nft-collection...|   13|
+--------------------+-----+
only showing top 20 rows



                                                                                