In [1]:
# Enable horizontal scrolling:
from IPython.display import display, HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [2]:
from pyspark.sql import SparkSession

In [3]:
spark = (
    SparkSession.builder.appName("Batch")  # type: ignore
    .master("spark://spark:7077")
    .config("spark.executor.cores", 1)
    .config("spark.executor.instances", 1)
    .config("spark.cores.max", 2)
    # Set configuration for Notebook display
    .config("spark.sql.repl.eagerEval.enabled", True)
    # Log level to WARN to avoid huge logs
    .config("spark.logConf", False)
    .getOrCreate()
)
# spark.conf.set("spark.sql.repl.eagerEval.enabled", True)
spark.sparkContext.setLogLevel("WARN")
spark

:: loading settings :: url = jar:file:/opt/bitnami/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /tmp/.ivy/cache
The jars for the packages stored in: /tmp/.ivy/jars
org.apache.spark#spark-avro_2.12 added as a dependency
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-ad3d458c-80d8-4ea0-9ff2-fb6e0206f8a7;1.0
	confs: [default]
	found org.apache.spark#spark-avro_2.12;3.5.0 in central
	found org.tukaani#xz;1.9 in central
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.5.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.5.0 in central
	found org.apache.kafka#kafka-clients;3.4.1 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.10.3 in central
	found org.slf4j#slf4j-api;2.0.7 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 

In [4]:
df = spark.read.json("s3a://raw-data/topics/OpenSeaRawEvents/year=*/month=*/day=*/hour=*/*.json.gz")
# print("Count: ", df.count())
# print("Partitions: ", df.rdd.getNumPartitions())
df

24/07/23 19:57:00 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
                                                                                

event,payload,ref,topic
item_metadata_upd...,{item_metadata_up...,,collection:*
item_transferred,{item_transferred...,,collection:*
item_transferred,{item_transferred...,,collection:*
item_transferred,{item_transferred...,,collection:*
item_transferred,{item_transferred...,,collection:*
item_metadata_upd...,{item_metadata_up...,,collection:*
item_transferred,{item_transferred...,,collection:*
item_metadata_upd...,{item_metadata_up...,,collection:*
item_transferred,{item_transferred...,,collection:*
item_metadata_upd...,{item_metadata_up...,,collection:*


In [5]:
# df.describe().show()
df.printSchema()

root
 |-- event: string (nullable = true)
 |-- payload: struct (nullable = true)
 |    |-- event_type: string (nullable = true)
 |    |-- payload: struct (nullable = true)
 |    |    |-- chain: string (nullable = true)
 |    |    |-- collection: struct (nullable = true)
 |    |    |    |-- slug: string (nullable = true)
 |    |    |-- event_timestamp: string (nullable = true)
 |    |    |-- from_account: struct (nullable = true)
 |    |    |    |-- address: string (nullable = true)
 |    |    |-- item: struct (nullable = true)
 |    |    |    |-- chain: struct (nullable = true)
 |    |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- metadata: struct (nullable = true)
 |    |    |    |    |-- animation_url: string (nullable = true)
 |    |    |    |    |-- background_color: string (nullable = true)
 |    |    |    |    |-- description: string (nullable = true)
 |    |    |    |    |-- image_url: string (nullable = true)
 |    |    |    |    |-- metadata_url: string (

In [8]:
from pyspark.sql import functions as F
from pyspark.sql import types as T

df_events = df.select(
    "event",
    F.col("payload.event_type").alias("event_type"),
    F.col("payload.payload.collection.slug").alias("collection_slug"),
    F.to_timestamp("payload.sent_at").alias("sent_at"),
    F.col("payload.status").alias("status"),
    F.col("payload.payload.item.metadata.name").alias("item_name"),
    F.col("payload.payload.item.permalink").alias("item_url"),
    F.col("payload.payload.item.nft_id").alias("item_nft_id"),
    F.col("payload.payload.item.metadata.image_url").alias("image_url"),
    F.col("payload.payload.item.chain.name").alias("item_blockchain"),
    F.to_timestamp(F.col("payload.payload.listing_date")).alias("listing_date"),
    F.col("payload.payload.listing_type").alias("listing_type"),
    F.col("payload.payload.from_account.address").alias("from_account"),
    F.col("payload.payload.to_account.address").alias("to_account"),
    F.col("payload.payload.payment_token.symbol").alias("payment_symbol"),
    F.col("payload.payload.payment_token.eth_price").cast(T.DoubleType()).alias("eth_price"),
    F.col("payload.payload.payment_token.usd_price").cast(T.DoubleType()).alias("usd_price"),
    F.col("payload.payload.quantity").cast(T.IntegerType()).alias("quantity"),
).filter(F.col("event") != "phx_reply")
df_events.cache()
df_events.show(truncate=False)

AnalysisException: [FIELD_NOT_FOUND] No such struct field `payment_token` in `chain`, `collection`, `event_timestamp`, `from_account`, `item`, `quantity`, `to_account`, `transaction`.

In [None]:
df_events.printSchema()

NameError: name 'df_events' is not defined

# Exploratory Analysis

In [None]:
df_events.select(F.max("sent_at"), F.min("sent_at")).show(truncate=False)



+--------------------------+--------------------------+
|max(sent_at)              |min(sent_at)              |
+--------------------------+--------------------------+
|2024-07-21 10:56:00.299731|2024-07-20 15:32:21.847767|
+--------------------------+--------------------------+



                                                                                

In [None]:
df_events.groupBy("event_type").count().show(truncate=False)



+---------------------+------+
|event_type           |count |
+---------------------+------+
|item_transferred     |104352|
|item_metadata_updated|8259  |
|item_listed          |5     |
|item_cancelled       |1     |
|item_sold            |2     |
|order_invalidate     |2     |
+---------------------+------+



                                                                                

In [None]:
df_events.filter((F.col("usd_price").isNotNull())).groupBy("event_type").count().show(truncate=False)



+-----------+-----+
|event_type |count|
+-----------+-----+
|item_listed|4    |
|item_sold  |2    |
+-----------+-----+



                                                                                

In [None]:
df_events.filter((F.col("event_type") == "item_sold")).limit(5)

                                                                                

event,event_type,collection_slug,sent_at,status,item_name,item_url,item_nft_id,image_url,item_blockchain,listing_date,listing_type,from_account,to_account,payment_symbol,eth_price,usd_price,quantity
item_sold,item_sold,meta-croak-not-wi...,2024-07-20 15:41:...,,Meta Croak - Not ...,https://testnets....,sepolia/0x50976f5...,https://i.seadn.i...,sepolia,,,,,ETH,1.0,3505.510000000000...,1
item_sold,item_sold,cryptoverse-contract,2024-07-20 15:54:...,,Blue,https://testnets....,avalanche_fuji/0x...,,avalanche_fuji,,,,,AVAX,0.00807959,28.32000000000000...,1


In [None]:
df_events.filter((F.col("event_type") == "item_listed")).limit(5)

event,event_type,collection_slug,sent_at,status,item_name,item_url,item_nft_id,image_url,item_blockchain,listing_date,listing_type,from_account,to_account,payment_symbol,eth_price,usd_price,quantity
item_listed,item_listed,meta-croak-not-wi...,2024-07-20 15:36:...,,Meta Croak - Not ...,https://testnets....,sepolia/0x50976f5...,https://i.seadn.i...,sepolia,2024-07-20 15:35:57,,,,ETH,1.0,3505.51,1
item_listed,item_listed,cyan-bayc-2,2024-07-20 15:43:...,,,https://testnets....,sepolia/0x300b105...,https://i.seadn.i...,sepolia,2024-07-20 15:43:24,,,,ETH,1.0,3505.51,1
item_listed,item_listed,unidentified-cont...,2024-07-20 19:01:...,,,https://testnets....,avalanche_fuji/0x...,,avalanche_fuji,2024-07-20 19:01:04,,,,AVAX,0.00819324,28.95,1
item_listed,item_listed,testseiberians9,2024-07-21 09:47:...,,Unrevealed Seiber...,https://testnets....,sepolia/0xb37c71e...,https://i.seadn.i...,sepolia,2024-07-21 09:47:41,,,,ETH,1.0,3494.29,1
item_listed,item_listed,unidentified-cont...,2024-07-20 19:15:...,,,https://testnets....,avalanche_fuji/0x...,,avalanche_fuji,2024-07-20 19:15:27,,,,AVAX,0.00819324,28.95,1


In [None]:
df_events.filter((F.col("event_type") == "item_received_bid")).limit(5)

event,event_type,collection_slug,sent_at,status,item_name,item_url,image_url,item_blockchain,from_account,to_account,payment_symbol,eth_price,usd_price,quantity


In [None]:
df_events.filter((F.col("event_type") == "item_transferred")).show(truncate=False)

+----------------+----------------+--------------------------------------------------+--------------------------+------+------------------------------+------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+----------------+------------------------------------------+------------------------------------------+--------------+---------+---------+--------+
|event           |event_type      |collection_slug                                   |sent_at                   |status|item_name                     |item_url                                                                                              |image_url                                                                            |item_blockchain |from_account                              |to_account                                |payment_symbol|eth_price|usd_price|quantity|
+----------------+------

# Global Metrics

In [None]:
time_frame = "1 hour"
time_frame_txt = "_".join(time_frame.split())
time_window = F.window("sent_at", time_frame)

## Marketplace transactions stats

In [None]:
transferred_items = df_events.filter((F.col("event_type") == "item_transferred"))
transferred_items = transferred_items.withColumn(
    "quantity", F.when(F.col("quantity").cast("int") > 0, F.col("quantity").cast("int")).otherwise(0)
)
transferred_items.printSchema()
transferred_items.show(truncate=False)

root
 |-- event: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- collection_slug: string (nullable = true)
 |-- sent_at: timestamp (nullable = true)
 |-- status: string (nullable = true)
 |-- item_name: string (nullable = true)
 |-- item_url: string (nullable = true)
 |-- image_url: string (nullable = true)
 |-- item_blockchain: string (nullable = true)
 |-- from_account: string (nullable = true)
 |-- to_account: string (nullable = true)
 |-- payment_symbol: string (nullable = true)
 |-- eth_price: string (nullable = true)
 |-- usd_price: string (nullable = true)
 |-- quantity: integer (nullable = true)

+----------------+----------------+--------------------------------------------------+--------------------------+------+------------------------------+------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+----------------+-------

In [None]:
windowed_transactions = (
    transferred_items
    .groupBy(time_window)
    .agg(
        F.count("*").alias("transfers_count"),
        F.sum("quantity").alias("items_transferred_count"),
    )
    .orderBy("window")
    .select(
        F.col("window.start").alias("window_start"),
        F.col("window.end").alias("window_end"),
        "transfers_count",
        "items_transferred_count",
    )
)
windowed_transactions.show(truncate=False)



+-------------------+-------------------+---------------+-----------------------+
|window_start       |window_end         |transfers_count|items_transferred_count|
+-------------------+-------------------+---------------+-----------------------+
|2024-07-20 10:00:00|2024-07-20 11:00:00|4936           |57235                  |
|2024-07-20 11:00:00|2024-07-20 12:00:00|11568          |9184401977             |
+-------------------+-------------------+---------------+-----------------------+



                                                                                

In [None]:
windowed_transactions_events = windowed_transactions.unpivot(
    ["window_start", "window_end"],
    ["transfers_count", "items_transferred_count"],
    "metric",
    "value"
).select(
    F.concat("metric", F.lit(f"__{time_frame_txt}")).alias("metric"),
    F.col("window_end").alias("timestamp"),
    "value",
    F.lit(None).alias("collection"),
)
windowed_transactions_events.show(truncate=False)

                                                                                

+-------------------------------+-------------------+----------+----------+
|metric                         |timestamp          |value     |collection|
+-------------------------------+-------------------+----------+----------+
|transfers_count__1_hour        |2024-07-20 11:00:00|4936      |NULL      |
|items_transferred_count__1_hour|2024-07-20 11:00:00|57235     |NULL      |
|transfers_count__1_hour        |2024-07-20 12:00:00|11568     |NULL      |
|items_transferred_count__1_hour|2024-07-20 12:00:00|9184401977|NULL      |
+-------------------------------+-------------------+----------+----------+



24/07/20 13:15:20 WARN JavaUtils: Attempt to delete using native Unix OS command failed for path = /tmp/blockmgr-68702f9a-49f7-4b3c-a4a4-14227d12cb09. Falling back to Java IO way
java.io.IOException: Failed to delete: /tmp/blockmgr-68702f9a-49f7-4b3c-a4a4-14227d12cb09
	at org.apache.spark.network.util.JavaUtils.deleteRecursivelyUsingUnixNative(JavaUtils.java:173)
	at org.apache.spark.network.util.JavaUtils.deleteRecursively(JavaUtils.java:109)
	at org.apache.spark.network.util.JavaUtils.deleteRecursively(JavaUtils.java:90)
	at org.apache.spark.util.SparkFileUtils.deleteRecursively(SparkFileUtils.scala:121)
	at org.apache.spark.util.SparkFileUtils.deleteRecursively$(SparkFileUtils.scala:120)
	at org.apache.spark.util.Utils$.deleteRecursively(Utils.scala:1126)
	at org.apache.spark.storage.DiskBlockManager.$anonfun$doStop$1(DiskBlockManager.scala:368)
	at org.apache.spark.storage.DiskBlockManager.$anonfun$doStop$1$adapted(DiskBlockManager.scala:364)
	at scala.collection.IndexedSeqOptimize

## Marketplace Sales Volume over time

In [None]:
sold_items = df_events.filter((F.col("event_type") == "item_sold"))
# Sales volume over time (Every hour)
sold_items_hourly = (
    sold_items.groupBy(time_window)
    .agg(F.sum("usd_price").alias("usd_volume"), F.count("*").alias("sales_count"))
    .orderBy("window")
    .select(
        F.col("window.start").alias("window_start"),
        F.col("window.end").alias("window_end"),
        "usd_volume",
        "sales_count",
    )
)
sold_items_hourly.show(truncate=False)



+-------------------+-------------------+----------+-----------+
|window_start       |window_end         |usd_volume|sales_count|
+-------------------+-------------------+----------+-----------+
|2024-07-20 11:00:00|2024-07-20 12:00:00|0.534207  |1          |
+-------------------+-------------------+----------+-----------+



                                                                                

In [None]:
sold_items_hourly_events = sold_items_hourly.select(
    F.lit(f"total_volume__{time_frame_txt}").alias("metric"),
    F.col("window_end").alias("timestamp"),
    F.col("usd_volume").alias("value"),
    F.lit(None).alias("collection"),
)
sold_items_hourly_events

                                                                                

metric,timestamp,value,collection
total_volume__1_hour,2024-07-20 12:00:00,0.534207,


## Top Collections by sales volume

In [None]:
from pyspark.sql.window import Window

top_collections = (
    sold_items.groupby("collection_slug", time_window)
    .agg(F.sum("usd_price").alias("usd_volume"), F.count("*").alias("sales_count"))
    .orderBy(F.desc("usd_volume"))
    .select(
        "collection_slug",
        F.col("window.start").alias("window_start"),
        F.col("window.end").alias("window_end"),
        "usd_volume",
        "sales_count",
    )
).withColumn(
    "window_rank",
    (
        F.row_number().over(
            Window.partitionBy("window_start", "window_end").orderBy(
                F.desc("usd_volume")
            )
        )
    ),
)
top_collections.show(truncate=False)



+---------------+-------------------+-------------------+----------+-----------+-----------+
|collection_slug|window_start       |window_end         |usd_volume|sales_count|window_rank|
+---------------+-------------------+-------------------+----------+-----------+-----------+
|hhhhhhhh1155   |2024-07-20 11:00:00|2024-07-20 12:00:00|0.534207  |1          |1          |
+---------------+-------------------+-------------------+----------+-----------+-----------+



                                                                                

In [None]:
top_collections_events = top_collections.select(
    F.lit(f"top_collections_by_volume__{time_frame_txt}").alias("metric"),
    F.col("window_end").alias("timestamp"),
    F.col("usd_volume").alias("value"),
    F.col("collection_slug").alias("collection"),
).filter(F.col("window_rank") <= 10)
top_collections_events.show(truncate=False)



+---------------------------------+-------------------+--------+------------+
|metric                           |timestamp          |value   |collection  |
+---------------------------------+-------------------+--------+------------+
|top_collections_by_volume__1_hour|2024-07-20 12:00:00|0.534207|hhhhhhhh1155|
+---------------------------------+-------------------+--------+------------+



                                                                                

# Collections Metrics

## Top Collections most valuable assets sold

In [None]:
top_collections_list = (
    top_collections.select("collection_slug")
    .distinct()
    .rdd.flatMap(lambda x: x)
    .collect()
)

                                                                                

In [None]:
top_collections_sales = sold_items.filter(
    F.col("collection_slug").isin(top_collections_list)
)
top_collections_assets = (
    top_collections_sales.join(
        top_collections,
        how="left",
        on=(
            (
                top_collections_sales["sent_at"].between(
                    top_collections["window_start"], top_collections["window_end"]
                )
            )
            & (
                top_collections_sales["collection_slug"]
                == top_collections["collection_slug"]
            )
        ),
    )
    .select(
        top_collections_sales["sent_at"],
        top_collections_sales["collection_slug"],
        top_collections_sales["item_name"],
        top_collections_sales["item_url"],
        top_collections_sales["image_url"],
        top_collections_sales["usd_price"],
        top_collections["window_start"],
        top_collections["window_end"],
    )
    .withColumn(
        "rank_by_price",
        F.row_number().over(
            Window.partitionBy("collection_slug", "window_start", "window_end").orderBy(
                F.desc("usd_price")
            )
        ),
    )
)
top_collections_assets

                                                                                

sent_at,collection_slug,item_name,item_url,image_url,usd_price,window_start,window_end,rank_by_price
2024-07-20 11:02:...,hhhhhhhh1155,web3game1155,https://testnets....,https://i.seadn.i...,0.534207,2024-07-20 11:00:00,2024-07-20 12:00:00,1


In [None]:
top_collections_assets_events = top_collections_assets.select(
    F.lit(f"collection_top_assets_by_price__{time_frame_txt}").alias("metric"),
    F.col("window_end").alias("timestamp"),
    F.col("collection_slug").alias("collection"),
    F.col("usd_price").alias("value"),
    F.col("item_name").alias("asset_name"),
    F.col("item_url").alias("asset_url"),
    "image_url",
).filter(F.col("rank_by_price") <= 20)
top_collections_assets_events.show(truncate=False)



+--------------------------------------+-------------------+------------+--------------------+------------+------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+
|metric                                |timestamp          |collection  |value               |asset_name  |asset_url                                                                           |image_url                                                                            |
+--------------------------------------+-------------------+------------+--------------------+------------+------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+
|collection_top_assets_by_price__1_hour|2024-07-20 12:00:00|hhhhhhhh1155|0.534207000000000000|web3game1155|https://testnets.opensea.io/assets/amoy/0xa4ddd7fc0aa185

                                                                                

## Collection Stats

In [None]:
top_collections_stats = top_collections_assets.groupby(
    "collection_slug",
    "window_start",
    "window_end"
).agg(
    F.min("usd_price").cast("double").alias("floor_price"),
    F.sum("usd_price").cast("double").alias("total_volume"),
    F.avg("usd_price").cast("double").alias("avg_assets_price"),
    F.count("*").cast("double").alias("total_sales"),
)
top_collections_stats

                                                                                

collection_slug,window_start,window_end,floor_price,total_volume,avg_assets_price,total_sales
hhhhhhhh1155,2024-07-20 11:00:00,2024-07-20 12:00:00,0.534207,0.534207,0.534207,1.0


In [None]:
top_collections_stats_events = top_collections_stats.unpivot(
    ["collection_slug", "window_start", "window_end"],
    ["floor_price", "total_volume", "avg_assets_price", "total_sales"],
    "metric",
    "value",
).select(
    F.concat(F.lit("collection_"), F.col("metric"), F.lit(f"__{time_frame_txt}")).alias("metric"),
    F.col("window_end").alias("timestamp"),
    F.col("collection_slug").alias("collection"),
    "value",
    F.lit(None).alias("asset_name"),
    F.lit(None).alias("asset_url"),
    F.lit(None).alias("image_url"),
)
top_collections_stats_events.show(truncate=False)



+-----------------------------------+-------------------+------------+--------+----------+---------+---------+
|metric                             |timestamp          |collection  |value   |asset_name|asset_url|image_url|
+-----------------------------------+-------------------+------------+--------+----------+---------+---------+
|collection_floor_price__1_hour     |2024-07-20 12:00:00|hhhhhhhh1155|0.534207|NULL      |NULL     |NULL     |
|collection_total_volume__1_hour    |2024-07-20 12:00:00|hhhhhhhh1155|0.534207|NULL      |NULL     |NULL     |
|collection_avg_assets_price__1_hour|2024-07-20 12:00:00|hhhhhhhh1155|0.534207|NULL      |NULL     |NULL     |
|collection_total_sales__1_hour     |2024-07-20 12:00:00|hhhhhhhh1155|1.0     |NULL      |NULL     |NULL     |
+-----------------------------------+-------------------+------------+--------+----------+---------+---------+



                                                                                

## Merge both datasets

In [None]:
top_collections_stats_events.union(top_collections_assets_events).show(truncate=False)



+--------------------------------------+-------------------+------------+--------------------+------------+------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+
|metric                                |timestamp          |collection  |value               |asset_name  |asset_url                                                                           |image_url                                                                            |
+--------------------------------------+-------------------+------------+--------------------+------------+------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+
|collection_floor_price__1_hour        |2024-07-20 12:00:00|hhhhhhhh1155|0.534207            |NULL        |NULL                                                    

                                                                                