In [0]:
import time
from pyspark.sql.functions import col, count, countDistinct

# Benchmark the aggregation below. perf_counter() is a monotonic,
# high-resolution clock intended for measuring intervals; unlike time.time()
# it cannot be skewed by system clock adjustments while the job is running.
# The timed window covers both building the DataFrame and running count().
start_time = time.perf_counter()

# Heavy Query: for every (brand, category_code) pair with a non-null brand,
# compute the total number of events and the number of distinct users, then
# sort by event volume (largest first).
heavy_df = spark.table("workspace.ecommerce.ecommerce_delta") \
    .filter(col("brand").isNotNull()) \
    .groupBy("brand", "category_code") \
    .agg(
        count("*").alias("total_events"),
        countDistinct("user_id").alias("unique_users")
    ) \
    .orderBy(col("total_events").desc())

# count() is the action that actually runs the job; the value returned is the
# number of rows in the aggregated result (one per brand/category_code group).
row_count = heavy_df.count()

end_time = time.perf_counter()
time_without_cache = end_time - start_time
print(f"Processed rows: {row_count}")
print(f"Time taken (No Cache): {time_without_cache:.2f} seconds")

In [0]:
# Inspect the query plan of the heavy aggregation. The "formatted" mode
# prints a physical plan outline followed by per-node details.
heavy_df.explain(
    mode="formatted",
)

In [0]:
import time

# Execute the same action twice back-to-back and time each run separately.
# perf_counter() is used because it is monotonic and high-resolution, so the
# measured intervals are not affected by system clock adjustments.
# NOTE(review): heavy_df.count() was already executed once in an earlier cell
# (timed there as time_without_cache), so the "First Run" measured here may
# not be a truly cold run — confirm which baseline this comparison should use.
start_time_1 = time.perf_counter()
heavy_df.count()
time_run_1 = time.perf_counter() - start_time_1
print(f"First Run Time: {time_run_1:.2f} seconds")

start_time_2 = time.perf_counter()
heavy_df.count()
time_run_2 = time.perf_counter() - start_time_2
print(f"Second Run Time: {time_run_2:.2f} seconds")

# Final Comparison
print("\nPerformance Comparison")
print(f"First Run (Uncached): {time_run_1:.2f} seconds")
print(f"Second Run (Auto-Cached): {time_run_2:.2f} seconds")

# Guard against division by zero, and only claim a speedup when the second
# run really was faster; otherwise report the ratio without the claim.
if time_run_2 > 0:
    speedup = time_run_1 / time_run_2
    if speedup > 1:
        print(f"Auto-Caching made our query {speedup:.1f}x faster")
    else:
        print(f"No speedup observed on the second run (ratio: {speedup:.2f}x)")