In [1]:
# Enable horizontal scrolling: stop <pre> output from wrapping so that wide
# tables printed by DataFrame.show() stay on a single line per row.
from IPython.display import HTML, display

no_wrap_css = "<style>pre { white-space: pre !important; }</style>"
display(HTML(no_wrap_css))

In [2]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

# getOrCreate() hands back the already-active session when there is one, so
# re-running this cell does not spin up a second Spark application.
spark = (
    SparkSession.builder
    .appName("Processing")
    .getOrCreate()
)

In [3]:
# Read every JSON-lines file matched by the glob into a single DataFrame;
# no schema is supplied, so Spark infers it from the data.
df = spark.read.format("json").load("../messages/*.jsonl")

# Quick preview of the raw messages (long cell values are truncated).
df.show()

+--------------------+--------------------+----+------------+
|               event|             payload| ref|       topic|
+--------------------+--------------------+----+------------+
|           phx_reply|{NULL, NULL, NULL...|   0|collection:*|
|           phx_reply|{NULL, NULL, NULL...|   0|     phoenix|
|    item_transferred|{item_transferred...|NULL|collection:*|
|item_metadata_upd...|{item_metadata_up...|NULL|collection:*|
|    item_transferred|{item_transferred...|NULL|collection:*|
|item_metadata_upd...|{item_metadata_up...|NULL|collection:*|
|    item_transferred|{item_transferred...|NULL|collection:*|
|item_metadata_upd...|{item_metadata_up...|NULL|collection:*|
|    item_transferred|{item_transferred...|NULL|collection:*|
|    item_transferred|{item_transferred...|NULL|collection:*|
|    item_transferred|{item_transferred...|NULL|collection:*|
|item_metadata_upd...|{item_metadata_up...|NULL|collection:*|
|item_metadata_upd...|{item_metadata_up...|NULL|collection:*|
|    ite

In [4]:
# Summary statistics (count / mean / stddev / min / max) for the top-level
# columns. The nested `payload` struct does not appear in the result, and
# `ref` has a far lower count than the other columns because it is mostly NULL.
stats_df = df.describe()
stats_df.show()

+-------+--------------------+---+------------+
|summary|               event|ref|       topic|
+-------+--------------------+---+------------+
|  count|                 986| 45|         986|
|   mean|                NULL|0.0|        NULL|
| stddev|                NULL|0.0|        NULL|
|    min|item_metadata_upd...|  0|collection:*|
|    max|           phx_reply|  0|     phoenix|
+-------+--------------------+---+------------+



In [9]:
# Flatten the nested payload into the handful of fields analysed below:
# the event name, the collection slug, and the send time both as the raw
# ISO-8601 string and parsed into a timestamp column.
sent_at_col = F.col("payload.sent_at")
df_events = (
    df.select(
        F.col("event"),
        F.col("payload.payload.collection.slug").alias("collection_slug"),
        sent_at_col.alias("sent_at"),
        F.to_timestamp(sent_at_col).alias("sent_at_ts"),
    )
    # Drop the `phx_reply` rows so only the item_* events remain.
    .where(F.col("event") != "phx_reply")
)

# truncate=False prints full cell values instead of cutting them off.
df_events.show(truncate=False)

+---------------------+--------------------------------+--------------------------------+--------------------------+
|event                |collection_slug                 |sent_at                         |sent_at_ts                |
+---------------------+--------------------------------+--------------------------------+--------------------------+
|item_transferred     |lll-12                          |2023-10-07T20:09:17.901084+00:00|2023-10-07 20:09:17.901084|
|item_metadata_updated|lll-12                          |2023-10-07T20:09:17.955958+00:00|2023-10-07 20:09:17.955958|
|item_transferred     |unidentified-contract-syi4yutfpm|2023-10-07T20:09:19.861155+00:00|2023-10-07 20:09:19.861155|
|item_metadata_updated|lll-12                          |2023-10-07T20:09:17.966948+00:00|2023-10-07 20:09:17.966948|
|item_transferred     |lll-12                          |2023-10-07T20:09:17.907710+00:00|2023-10-07 20:09:17.90771 |
|item_metadata_updated|lll-12                          |2023-10-

In [6]:
# Number of events per collection, most active collection first.
# NOTE: despite the variable name, the grouping key is `collection_slug`,
# not the event type.
event_type_agg_df = (
    df_events
    .groupBy("collection_slug")
    .agg(F.count("*").alias("count"))
    .sort(F.col("count").desc())
)
event_type_agg_df.show()

+--------------------+-----+
|     collection_slug|count|
+--------------------+-----+
|flexiblereturntok...|  510|
|              lll-12|  204|
|     vera-uj9hinocnp|   24|
| gnomes-collective-2|   20|
|             bird-22|   14|
|      gabbyexplore-1|   11|
|unidentified-cont...|   10|
|   gabbyexperience-2|    8|
|   demask-creator-19|    7|
|               npg-3|    7|
|badgetoken-eo1x1i...|    6|
|  shiba-crypto-nft-2|    6|
|         home-boys-2|    5|
|sleep-number-chal...|    5|
|collective-finance-3|    5|
|jackrabbit-jackal...|    4|
|unidentified-cont...|    4|
|unidentified-cont...|    4|
|      art-entr-elles|    4|
|       gabbyprompt-3|    4|
+--------------------+-----+
only showing top 20 rows

