In [None]:
# lakehouse/notebooks/transform_gas_data.ipynb
# Databricks notebook source
# MAGIC %md
# MAGIC # Ethereum Gas Data Transformation
# MAGIC 
# MAGIC This notebook transforms raw Ethereum gas price data from the lakehouse raw tables
# MAGIC and creates curated and aggregated tables for analysis.

# COMMAND ----------

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
from delta.tables import *

In [None]:
# Build the Spark session (getOrCreate reuses an already-active one) with the
# Delta Lake SQL extension and Delta catalog configured.
spark = (
    SparkSession.builder
    .appName("EthereumGasTransformation")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

In [None]:
# COMMAND ----------

# MAGIC %md
# MAGIC ## Read Raw Gas Data

# COMMAND ----------

# Load the raw gas-price Delta table from the lakehouse.
raw_gas_reader = spark.read.format("delta")
raw_gas_df = raw_gas_reader.load("Tables/raw/eth_gas_raw")

In [None]:
# Inspect the raw table: column names/types first, then the first five rows.
print("Raw gas data schema:")
raw_gas_df.printSchema()

print("\nSample data:")
raw_gas_df.show(n=5)

In [None]:
# COMMAND ----------

# MAGIC %md
# MAGIC ## Data Quality Checks

# COMMAND ----------

# Data quality checks.
# All three metrics come from ONE aggregation so the raw table is scanned once
# instead of once per metric.
# count(when(cond, True)) counts only rows where cond is true: `when` without an
# `otherwise` yields NULL for false/NULL conditions and count() skips NULLs, which
# gives the same numbers as filter(cond).count() (including 0 on an empty table).
print("Data Quality Checks:")
quality_row = raw_gas_df.agg(
    count("*").alias("total_records"),
    count(when(col("timestamp").isNull(), True)).alias("null_timestamps"),
    count(when(col("fast_gas_price") > 1000, True)).alias("invalid_gas_prices"),
).first()  # a global aggregation always returns exactly one row

print(f"Total records: {quality_row['total_records']}")
print(f"Null timestamps: {quality_row['null_timestamps']}")
print(f"Invalid gas prices (> 1000 gwei): {quality_row['invalid_gas_prices']}")

In [None]:
# Count the distinct timestamps that appear on more than one raw row.
rows_per_timestamp = raw_gas_df.groupBy("timestamp").count()
repeated_timestamps = rows_per_timestamp.where(col("count") > 1)
duplicate_count = repeated_timestamps.count()
print(f"Duplicate timestamps: {duplicate_count}")


In [None]:
# COMMAND ----------

# MAGIC %md
# MAGIC ## Transform to Curated Data

# COMMAND ----------

# Build the curated frame: parse the timestamp, derive calendar fields, bucket the
# fast gas price, then keep only rows with a positive, non-null fast gas price.
fast_price = col("fast_gas_price")
event_time = col("timestamp_dt")

# dayofweek() numbers days 1 (Sunday) .. 7 (Saturday), so {1, 7} is the weekend.
weekend_flag = when(col("day_of_week").isin([1, 7]), True).otherwise(False)

# Buckets are evaluated top-down; anything not below 100 lands in "Very High".
price_bucket = (
    when(fast_price < 20, "Low")
    .when(fast_price < 50, "Medium")
    .when(fast_price < 100, "High")
    .otherwise("Very High")
)

curated_gas_df = (
    raw_gas_df
    .withColumn("timestamp_dt", to_timestamp(col("timestamp")))
    .withColumn("date", date_format(event_time, "yyyy-MM-dd"))  # string column, e.g. "2024-01-31"
    .withColumn("hour", hour(event_time))
    .withColumn("day_of_week", dayofweek(event_time))
    .withColumn("is_weekend", weekend_flag)
    .withColumn("gas_price_category", price_bucket)
    .where(fast_price.isNotNull() & (fast_price > 0))
)

In [None]:
# Add trailing 1-hour and 24-hour moving averages for each price tier.
#
# The frames are defined on elapsed time (epoch seconds of timestamp_dt) instead of a
# fixed number of rows. A row-count frame only spans "1 hour" when every 5-minute
# sample is present exactly once; rows dropped by the price filter, missing samples
# or duplicate timestamps would silently stretch or shrink it.
# The lower bound stops one second short of the full span so that, on a gap-free
# 5-minute series, each frame still holds the same 12 (1h) / 288 (24h) samples as the
# previous rowsBetween(-11, 0) / rowsBetween(-287, 0) definition.
# NOTE: there is no partitionBy, so Spark evaluates these windows in a single partition.
SECONDS_PER_HOUR = 60 * 60
event_epoch_seconds = col("timestamp_dt").cast("long")  # timestamp -> seconds since epoch

window_1h = Window.orderBy(event_epoch_seconds).rangeBetween(-(SECONDS_PER_HOUR - 1), 0)
window_24h = Window.orderBy(event_epoch_seconds).rangeBetween(-(24 * SECONDS_PER_HOUR - 1), 0)

# Column order is unchanged: the three 1h averages first, then the three 24h averages.
trailing_windows = (("1h", window_1h), ("24h", window_24h))
for span_label, trailing_window in trailing_windows:
    for price_tier in ("safe", "standard", "fast"):
        curated_gas_df = curated_gas_df.withColumn(
            f"{price_tier}_gas_ma_{span_label}",
            avg(f"{price_tier}_gas_price").over(trailing_window),
        )

print("Curated gas data schema:")
curated_gas_df.printSchema()

In [None]:
# COMMAND ----------

# MAGIC %md
# MAGIC ## Write Curated Data

# COMMAND ----------

# Persist the curated frame as a Delta table partitioned by date, replacing any
# existing contents at that path.
curated_writer = (
    curated_gas_df.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("date")
)
curated_writer.save("Tables/curated/eth_gas_curated")

print("Curated gas data written successfully")

In [None]:
# COMMAND ----------

# MAGIC %md
# MAGIC ## Build Daily and Hourly Aggregations

# COMMAND ----------

# Daily rollup of the curated data: one output row per date.
price_category = col("gas_price_category")
low_period_flag = when(price_category == "Low", 1).otherwise(0)
# "high_gas_periods" counts only rows in the "Very High" bucket (not "High").
very_high_period_flag = when(price_category == "Very High", 1).otherwise(0)

daily_metrics = [
    avg("safe_gas_price").alias("avg_safe_gas"),
    avg("standard_gas_price").alias("avg_standard_gas"),
    avg("fast_gas_price").alias("avg_fast_gas"),
    min("safe_gas_price").alias("min_safe_gas"),
    max("fast_gas_price").alias("max_fast_gas"),
    stddev("fast_gas_price").alias("fast_gas_volatility"),
    count("*").alias("data_points"),
    sum(low_period_flag).alias("low_gas_periods"),
    sum(very_high_period_flag).alias("high_gas_periods"),
]

# Classify each day by the standard deviation of its fast gas price.
# NOTE(review): when the volatility is NULL (presumably a date with a single data
# point) neither comparison is true, so the day is labelled "Volatile" - confirm
# that this is intended.
daily_volatility = col("fast_gas_volatility")
volatility_bucket = (
    when(daily_volatility < 10, "Stable")
    .when(daily_volatility < 25, "Moderate")
    .otherwise("Volatile")
)

daily_gas_agg = (
    curated_gas_df
    .groupBy("date")
    .agg(*daily_metrics)
    .withColumn("gas_volatility_category", volatility_bucket)
)

In [None]:
# Hourly rollup of the curated data: one output row per (date, hour).
hourly_metrics = [
    avg("safe_gas_price").alias("avg_safe_gas"),
    avg("standard_gas_price").alias("avg_standard_gas"),
    avg("fast_gas_price").alias("avg_fast_gas"),
    min("safe_gas_price").alias("min_safe_gas"),
    max("fast_gas_price").alias("max_fast_gas"),
    count("*").alias("data_points"),
]

# Label the part of day; any hour outside 0-17 falls through to "Evening".
hour_of_day = col("hour")
part_of_day = (
    when(hour_of_day.between(0, 5), "Night")
    .when(hour_of_day.between(6, 11), "Morning")
    .when(hour_of_day.between(12, 17), "Afternoon")
    .otherwise("Evening")
)

hourly_gas_agg = (
    curated_gas_df
    .groupBy("date", "hour")
    .agg(*hourly_metrics)
    .withColumn("hour_category", part_of_day)
)

In [None]:
# COMMAND ----------

# MAGIC %md
# MAGIC ## Write Aggregated Data

# COMMAND ----------

# Persist both rollups as Delta tables, replacing any existing contents.

# Daily rollup: unpartitioned.
daily_writer = daily_gas_agg.write.format("delta").mode("overwrite")
daily_writer.save("Tables/aggregated/eth_gas_daily")

# Hourly rollup: partitioned by date.
hourly_writer = hourly_gas_agg.write.format("delta").mode("overwrite").partitionBy("date")
hourly_writer.save("Tables/aggregated/eth_gas_hourly")

print("Aggregated gas data written successfully")

In [None]:
# COMMAND ----------

# MAGIC %md
# MAGIC ## Data Validation and Summary

# COMMAND ----------

# Report the row count of every stage. Each count() is a separate Spark action.
print("=== TRANSFORMATION SUMMARY ===")

raw_total = raw_gas_df.count()
print(f"Raw records processed: {raw_total}")

curated_total = curated_gas_df.count()
print(f"Curated records created: {curated_total}")

daily_total = daily_gas_agg.count()
print(f"Daily aggregation records: {daily_total}")

hourly_total = hourly_gas_agg.count()
print(f"Hourly aggregation records: {hourly_total}")

In [None]:
# Preview the rollups: the five most recent dates, then the hourly rows of the
# most recent date present in the daily rollup.
print("\nDaily Gas Price Summary (Last 5 days):")
daily_gas_agg.orderBy(col("date").desc()).show(5)

print("\nHourly Gas Price Summary (Last 24 hours):")
# A global aggregation returns exactly one row; its only field is the max date
# (None when the daily rollup is empty).
latest_date = daily_gas_agg.agg(max("date")).first()[0]
hourly_gas_agg.where(col("date") == latest_date).orderBy("hour").show(24)

In [None]:
# COMMAND ----------

# MAGIC %md
# MAGIC ## Create Views for Real-time Queries

# COMMAND ----------

# Register the curated frame and both rollups as temporary views so they can be
# queried with SQL; an existing view with the same name is replaced.
views_to_register = (
    ("gas_data_realtime", curated_gas_df),
    ("gas_data_daily", daily_gas_agg),
    ("gas_data_hourly", hourly_gas_agg),
)
for view_name, frame in views_to_register:
    frame.createOrReplaceTempView(view_name)

print("Temporary views created for real-time queries")

# COMMAND ----------

# MAGIC %sql
# MAGIC -- Compact the written Delta tables and Z-order their data files.
# MAGIC -- Z-ordering is only allowed on data columns: eth_gas_hourly is partitioned by
# MAGIC -- date, so it is Z-ordered by hour alone (listing the partition column makes
# MAGIC -- OPTIMIZE fail).
# MAGIC OPTIMIZE delta.`Tables/curated/eth_gas_curated` ZORDER BY (timestamp_dt);
# MAGIC OPTIMIZE delta.`Tables/aggregated/eth_gas_daily` ZORDER BY (date);
# MAGIC OPTIMIZE delta.`Tables/aggregated/eth_gas_hourly` ZORDER BY (hour);