In [0]:
# ============================================================
# 03_Benchmark_Suite
# Purpose: Pure timing benchmarks — CSV vs 10M PP vs 100M PP
#          Task1: Filter + Aggregation
#          Stress1: High-Cardinality GroupBy
#          Stress2: Window Function (Row Number)
#          Nuclear: Double Shuffle (Repartition + Window + Agg)
# ============================================================

import time
from pyspark.sql.functions import col, year, month, row_number, sum as _sum
from pyspark.sql.window import Window

# ── Config ───────────────────────────────────────────────────
spark.conf.set("spark.sql.shuffle.partitions", 400)

CSV_PATH          = "/Volumes/workspace/default/raw_data/ecommerce_10M_55cols.csv"
PARQUET_10M_PATH  = "/Volumes/workspace/default/raw_data/ecommerce_parquet"
PARQUET_100M_PATH = "/Volumes/workspace/default/raw_data/ecommerce_100M_parquet"

# ── Load All Datasets ────────────────────────────────────────
print("Loading datasets...")

df_csv_10M  = spark.read.option("header","true").option("inferSchema","true").csv(CSV_PATH)
df_pp_10M   = spark.read.parquet(PARQUET_10M_PATH)
df_pp_100M  = spark.read.parquet(PARQUET_100M_PATH)

print(f"CSV 10M  rows: {df_csv_10M.count():,}")
print(f"PP  10M  rows: {df_pp_10M.count():,}")
print(f"PP  100M rows: {df_pp_100M.count():,}")

# ── Benchmark Wrapper ────────────────────────────────────────
def benchmark(label, func):
    start    = time.time()
    result   = func()
    duration = round(time.time() - start, 2)
    print(f"  {label:<35} → {duration} seconds")
    return duration

# ── Task 1: Filter + Multi-Column Aggregation ────────────────
print("\n" + "=" * 60)
print("TASK 1: Filter + Aggregation")
print("=" * 60)

def task1(df):
    df_temp = df
    if "year" not in df.columns:
        df_temp = df_temp \
            .withColumn("year",  year(col("order_date"))) \
            .withColumn("month", month(col("order_date")))
    return df_temp \
        .filter((col("year") >= 2023) & (col("month") >= 6) & (col("discount") > 500)) \
        .groupBy("city", "category") \
        .agg({"final_price": "sum", "quantity": "sum"}) \
        .count()

t1_csv  = benchmark("CSV  10M  Task1", lambda: task1(df_csv_10M))
t1_pp10 = benchmark("PP   10M  Task1", lambda: task1(df_pp_10M))
t1_pp100= benchmark("PP   100M Task1", lambda: task1(df_pp_100M))

print(f"\n  Speedup (CSV → PP 10M)  : {round(t1_csv / t1_pp10, 1)}x faster")
print(f"  Scaling (10M → 100M PP) : {round(t1_pp100 / t1_pp10, 1)}x (expect ~10x for linear)")

# ── Stress 1: High-Cardinality GroupBy ──────────────────────
print("\n" + "=" * 60)
print("STRESS 1: High-Cardinality GroupBy (4 dimensions)")
print("=" * 60)

def stress1(df):
    return df \
        .groupBy("city", "category", "payment_method", "product_id") \
        .agg({"final_price": "sum"}) \
        .count()

s1_csv  = benchmark("CSV  10M  Stress1", lambda: stress1(df_csv_10M))
s1_pp10 = benchmark("PP   10M  Stress1", lambda: stress1(df_pp_10M))
s1_pp100= benchmark("PP   100M Stress1", lambda: stress1(df_pp_100M))

print(f"\n  Speedup (CSV → PP 10M)  : {round(s1_csv / s1_pp10, 1)}x faster")

# ── Stress 2: Window Function (Row Number per User) ──────────
print("\n" + "=" * 60)
print("STRESS 2: Window Function — Row Number per user_id")
print("=" * 60)

def stress2(df):
    window = Window.partitionBy("user_id").orderBy(col("final_price").desc())
    return df.withColumn("rank", row_number().over(window)).count()

s2_csv  = benchmark("CSV  10M  Stress2", lambda: stress2(df_csv_10M))
s2_pp10 = benchmark("PP   10M  Stress2", lambda: stress2(df_pp_10M))
s2_pp100= benchmark("PP   100M Stress2", lambda: stress2(df_pp_100M))

print(f"\n  Speedup (CSV → PP 10M)  : {round(s2_csv / s2_pp10, 1)}x faster")

# ── Nuclear: Double-Shuffle Stress Test ──────────────────────
print("\n" + "=" * 60)
print("NUCLEAR: Repartition(800) + Window + GroupBy (2 shuffles)")
print("=" * 60)

def nuclear_stress(df):
    df2      = df.select("user_id","product_id","city","category","payment_method","final_price")
    shuffled = df2.repartition(800, "user_id")
    window   = Window.partitionBy("user_id").orderBy(col("final_price").desc())
    ranked   = shuffled.withColumn("rank", row_number().over(window))
    result   = ranked.groupBy("city","category","payment_method").agg(_sum("final_price"))
    return result.count()

n_csv   = benchmark("CSV  10M  Nuclear", lambda: nuclear_stress(df_csv_10M))
n_pp10  = benchmark("PP   10M  Nuclear", lambda: nuclear_stress(df_pp_10M))
n_pp100 = benchmark("PP   100M Nuclear", lambda: nuclear_stress(df_pp_100M))

print(f"\n  Speedup (CSV → PP 10M)  : {round(n_csv / n_pp10, 1)}x faster")

# ── Final Summary ─────────────────────────────────────────────
print("\n" + "=" * 60)
print("BENCHMARK SUMMARY")
print("=" * 60)
print(f"{'Workload':<35} {'CSV 10M':>10} {'PP 10M':>10} {'PP 100M':>10}")
print("-" * 65)
print(f"{'Task1  (Filter + Agg)':<35} {t1_csv:>10} {t1_pp10:>10} {t1_pp100:>10}")
print(f"{'Stress1 (High-Card GroupBy)':<35} {s1_csv:>10} {s1_pp10:>10} {s1_pp100:>10}")
print(f"{'Stress2 (Window Function)':<35} {s2_csv:>10} {s2_pp10:>10} {s2_pp100:>10}")
print(f"{'Nuclear (Double Shuffle)':<35} {n_csv:>10} {n_pp10:>10} {n_pp100:>10}")
print("=" * 60)
print("✅ Benchmark Complete!")

Loading datasets...
CSV 10M  rows: 10,000,000
PP  10M  rows: 10,000,000
PP  100M rows: 100,000,000

TASK 1: Filter + Aggregation
  CSV  10M  Task1                     → 9.8 seconds
  PP   10M  Task1                     → 1.7 seconds
  PP   100M Task1                     → 18.43 seconds

  Speedup (CSV → PP 10M)  : 5.8x faster
  Scaling (10M → 100M PP) : 10.8x (expect ~10x for linear)

STRESS 1: High-Cardinality GroupBy (4 dimensions)
  CSV  10M  Stress1                   → 8.61 seconds
  PP   10M  Stress1                   → 2.71 seconds
  PP   100M Stress1                   → 34.06 seconds

  Speedup (CSV → PP 10M)  : 3.2x faster

STRESS 2: Window Function — Row Number per user_id
  CSV  10M  Stress2                   → 5.62 seconds
  PP   10M  Stress2                   → 1.15 seconds
  PP   100M Stress2                   → 3.68 seconds

  Speedup (CSV → PP 10M)  : 4.9x faster

NUCLEAR: Repartition(800) + Window + GroupBy (2 shuffles)
  CSV  10M  Nuclear                   → 9.88 secon