In [1]:
# Enable horizontal scrolling:
from IPython.display import display, HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [2]:
from pyspark import SparkContext, HiveContext
from pyspark.sql import SparkSession, SQLContext

In [3]:
# Connect to Spark on port 7077
spark = (
    SparkSession.builder
    .master("spark://spark:7077")
    .appName("batch")
    .getOrCreate()
)

In [4]:
sqlContext = SQLContext(spark)
spark.sparkContext.getConf().getAll()



[('spark.app.startTime', '1700167368726'),
 ('spark.hadoop.fs.s3a.connection.ssl.enabled', 'false'),
 ('spark.hadoop.fs.s3a.access.key', 'minio'),
 ('spark.hadoop.fs.s3a.path.style.access', 'true'),
 ('spark.master', 'spark://spark:7077'),
 ('spark.driver.host', '49273155dcb3'),
 ('spark.jars',
  'file:/home/jovyan/jars/hadoop-aws-3.3.2.jar,file:/home/jovyan/jars/aws-java-sdk-1.12.367.jar,file:/home/jovyan/jars/s3-2.18.41.jar,file:/home/jovyan/jars/aws-java-sdk-bundle-1.11.1026.jar'),
 ('spark.driver.port', '44943'),
 ('spark.executor.id', 'driver'),
 ('spark.app.id', 'app-20231116204250-0002'),
 ('spark.driver.extraJavaOptions',
  '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=ja

In [13]:
df = spark.read.json("s3a://raw-messages/*.jsonl")
print("Count: ", df.count())
df.show()

Count:  986
+--------------------+--------------------+----+------------+
|               event|             payload| ref|       topic|
+--------------------+--------------------+----+------------+
|           phx_reply|{NULL, NULL, NULL...|   0|collection:*|
|           phx_reply|{NULL, NULL, NULL...|   0|     phoenix|
|    item_transferred|{item_transferred...|NULL|collection:*|
|item_metadata_upd...|{item_metadata_up...|NULL|collection:*|
|    item_transferred|{item_transferred...|NULL|collection:*|
|item_metadata_upd...|{item_metadata_up...|NULL|collection:*|
|    item_transferred|{item_transferred...|NULL|collection:*|
|item_metadata_upd...|{item_metadata_up...|NULL|collection:*|
|    item_transferred|{item_transferred...|NULL|collection:*|
|    item_transferred|{item_transferred...|NULL|collection:*|
|    item_transferred|{item_transferred...|NULL|collection:*|
|item_metadata_upd...|{item_metadata_up...|NULL|collection:*|
|item_metadata_upd...|{item_metadata_up...|NULL|collection

In [14]:
df.describe().show()
df.printSchema()

+-------+--------------------+---+------------+
|summary|               event|ref|       topic|
+-------+--------------------+---+------------+
|  count|                 986| 45|         986|
|   mean|                NULL|0.0|        NULL|
| stddev|                NULL|0.0|        NULL|
|    min|item_metadata_upd...|  0|collection:*|
|    max|           phx_reply|  0|     phoenix|
+-------+--------------------+---+------------+

root
 |-- event: string (nullable = true)
 |-- payload: struct (nullable = true)
 |    |-- event_type: string (nullable = true)
 |    |-- payload: struct (nullable = true)
 |    |    |-- chain: string (nullable = true)
 |    |    |-- collection: struct (nullable = true)
 |    |    |    |-- slug: string (nullable = true)
 |    |    |-- event_timestamp: string (nullable = true)
 |    |    |-- from_account: struct (nullable = true)
 |    |    |    |-- address: string (nullable = true)
 |    |    |-- item: struct (nullable = true)
 |    |    |    |-- chain: struct 

In [11]:
from pyspark.sql import functions as F

df_events = df.select(
    "event",
    F.col("payload.event_type").alias("event_type"),
    F.col("payload.payload.collection.slug").alias("collection_slug"),
    F.col("payload.sent_at").alias("sent_at"),
    F.to_timestamp("payload.sent_at").alias("sent_at_ts"),
).filter(F.col("event")!="phx_reply")
df_events.show(truncate = False)

+---------------------+---------------------+--------------------------------+--------------------------------+--------------------------+
|event                |event_type           |collection_slug                 |sent_at                         |sent_at_ts                |
+---------------------+---------------------+--------------------------------+--------------------------------+--------------------------+
|item_transferred     |item_transferred     |lll-12                          |2023-10-07T20:09:17.901084+00:00|2023-10-07 20:09:17.901084|
|item_metadata_updated|item_metadata_updated|lll-12                          |2023-10-07T20:09:17.955958+00:00|2023-10-07 20:09:17.955958|
|item_transferred     |item_transferred     |unidentified-contract-syi4yutfpm|2023-10-07T20:09:19.861155+00:00|2023-10-07 20:09:19.861155|
|item_metadata_updated|item_metadata_updated|lll-12                          |2023-10-07T20:09:17.966948+00:00|2023-10-07 20:09:17.966948|
|item_transferred     |item

In [12]:
event_type_agg_df = df_events.groupBy("collection_slug").agg(F.count("*").alias("count")).orderBy(F.desc("count"))
event_type_agg_df.show()

+--------------------+-----+
|     collection_slug|count|
+--------------------+-----+
|flexiblereturntok...|  510|
|              lll-12|  204|
|     vera-uj9hinocnp|   24|
| gnomes-collective-2|   20|
|             bird-22|   14|
|      gabbyexplore-1|   11|
|unidentified-cont...|   10|
|   gabbyexperience-2|    8|
|               npg-3|    7|
|   demask-creator-19|    7|
|badgetoken-eo1x1i...|    6|
|  shiba-crypto-nft-2|    6|
|         home-boys-2|    5|
|sleep-number-chal...|    5|
|collective-finance-3|    5|
|unidentified-cont...|    4|
|jackrabbit-jackal...|    4|
|unidentified-cont...|    4|
|       gabbyprompt-3|    4|
|        market-nft-1|    4|
+--------------------+-----+
only showing top 20 rows

