In [0]:
# ============================================================
# 02_Optimization_Validation
# Purpose: Prove Partition Pruning + Column Pruning via
#          Spark physical plan (explain)
# ============================================================

# ── Load Parquet ─────────────────────────────────────────────
# `spark` is the session object supplied by the notebook runtime.
PARQUET_10M_PATH = "/Volumes/workspace/default/raw_data/ecommerce_parquet"

df_pp_10M = spark.read.parquet(PARQUET_10M_PATH)

# ── Shared pieces for the four tests ─────────────────────────
_RULE = "=" * 60                              # separator framing each heading
_YEAR_FILTER = "year = 2024"                  # predicate reused by tests 1 and 4
_PRUNED_COLUMNS = ("category", "final_price")  # projection reused by tests 3 and 4


def _banner(title, hint, first=False):
    """Print a test heading framed by two rules.

    Every banner except the first is preceded by a blank line so that
    consecutive plan dumps are visually separated.
    """
    opening_rule = _RULE if first else "\n" + _RULE
    for line in (opening_rule, title, hint, _RULE):
        print(line)


# ── Test 1: Partition Pruning ────────────────────────────────
# NOTE(review): assumes the dataset is partitioned on `year` — confirm
# against the writer; otherwise the predicate shows up as a data filter.
_banner(
    "TEST 1: Partition Pruning",
    "Filter on 'year = 2024' — look for PartitionFilters in plan",
    first=True,
)
year_only = df_pp_10M.filter(_YEAR_FILTER)
year_only.explain(True)  # extended mode: logical plans plus the physical plan

# ── Test 2: Multi-Column Partition Pruning ───────────────────
_banner(
    "TEST 2: Multi-Column Partition Pruning",
    "Filter on year + month — should prune partitions further",
)
year_and_month = df_pp_10M.filter("year = 2024 AND month = 6")
year_and_month.explain(True)

# ── Test 3: Column Pruning ───────────────────────────────────
_banner(
    "TEST 3: Column Pruning",
    "Select only 2 columns — verify ReadSchema contains only those",
)
two_columns = df_pp_10M.select(*_PRUNED_COLUMNS)
two_columns.explain(True)

# ── Test 4: Combined Pruning (Best Case) ─────────────────────
_banner(
    "TEST 4: Combined Partition + Column Pruning",
    "This is maximum Parquet optimization",
)
filtered_and_projected = df_pp_10M.filter(_YEAR_FILTER).select(*_PRUNED_COLUMNS)
filtered_and_projected.explain(True)

# ── Wrap-up: what to look for in the plans printed above ─────
print("\n✅ Optimization Validation Complete!")
print("""
Key things to verify in physical plan:
  → PartitionFilters: [isnotnull(year), (year = 2024)]
  → ReadSchema: struct<category:string, final_price:double>
  → Only relevant columns in ReadSchema (column pruning proof)
""")

TEST 1: Partition Pruning
Filter on 'year = 2024' — look for PartitionFilters in plan
== Parsed Logical Plan ==
'Filter ('year = 2024)
+- Relation [transaction_id#13469,user_id#13470,first_name#13471,last_name#13472,age#13473,gender#13474,phone#13475L,email#13476,city#13477,state#13478,pincode#13479,registration_date#13480,order_id#13481,order_date#13482,order_status#13483,order_channel#13484,product_id#13485,product_name#13486,category#13487,quantity#13488,price_per_unit#13489,total_price#13490,discount#13491,tax#13492,final_price#13493,... 33 more fields] parquet

== Analyzed Logical Plan ==
transaction_id: int, user_id: int, first_name: string, last_name: string, age: int, gender: string, phone: bigint, email: string, city: string, state: string, pincode: int, registration_date: date, order_id: int, order_date: date, order_status: string, order_channel: string, product_id: int, product_name: string, category: string, quantity: int, price_per_unit: double, total_price: double, discou