In [0]:
# ============================================================
# 04_Cost_And_IO_Analysis
# Purpose:
#   1. Compare CSV vs Parquet storage size
#   2. Estimate Column Pruning IO reduction
#   3. Estimate Partition Pruning IO reduction
#   4. Estimate Combined IO reduction
#   5. Translate IO savings into cloud cost impact
# ============================================================

from pyspark.sql.functions import col

# ─────────────────────────────────────────────────────────────
# CONFIG
# ─────────────────────────────────────────────────────────────

# Input datasets: the raw CSV file and the Parquet dataset it is compared with.
CSV_PATH = "/Volumes/workspace/default/raw_data/ecommerce_10M_55cols.csv"
PARQUET_10M_PATH = "/Volumes/workspace/default/raw_data/ecommerce_parquet"

# Column counts that drive the column-pruning estimate.
TOTAL_COLUMNS = 55
SELECTED_COLUMNS = 2  # example query: category, final_price

# Assumed warehouse cost ($ per TB scanned).
COST_PER_TB = 5

# ─────────────────────────────────────────────────────────────
# UTILITY FUNCTIONS
# ─────────────────────────────────────────────────────────────

def bytes_to_gb(bytes_val):
    """Convert a byte count to gigabytes, using 1 GB = 1024**3 bytes."""
    bytes_per_gb = 2 ** 30  # identical to 1024**3
    return bytes_val / bytes_per_gb

def bytes_to_tb(bytes_val):
    """Convert a byte count to terabytes, using 1 TB = 1024**4 bytes."""
    bytes_per_tb = 2 ** 40  # identical to 1024**4
    return bytes_val / bytes_per_tb

def format_gb(bytes_val):
    """Return *bytes_val* expressed in GB, rounded to two decimal places."""
    gigabytes = bytes_to_gb(bytes_val)
    return round(gigabytes, 2)

def estimate_cost(bytes_val):
    """Return the estimated scan cost in dollars, rounded to four decimals.

    The cost is the size of *bytes_val* in TB multiplied by the module-level
    ``COST_PER_TB`` rate.
    """
    terabytes = bytes_to_tb(bytes_val)
    return round(terabytes * COST_PER_TB, 4)

# Recursive folder size calculation (IMPORTANT FIX)
def get_folder_size_recursive(path):
    """Return the total size in bytes of everything listed under *path*.

    Lists *path* with ``dbutils.fs.ls`` and descends into every entry that
    reports itself as a directory; the ``size`` of every other entry is added
    to the total. An empty listing yields 0.
    """
    return sum(
        get_folder_size_recursive(entry.path) if entry.isDir() else entry.size
        for entry in dbutils.fs.ls(path)
    )

# ─────────────────────────────────────────────────────────────
# STEP 1: STORAGE SIZE COMPARISON
# ─────────────────────────────────────────────────────────────

print("=" * 60)
print("STEP 1: Storage Size Comparison")
print("=" * 60)

# CSV size: the first listing entry for CSV_PATH is taken as the file itself.
csv_info = dbutils.fs.ls(CSV_PATH)[0]
csv_size_bytes = csv_info.size

# Parquet size (recursive): the dataset is a directory tree, so every nested
# file has to be summed.
parquet_size_bytes = get_folder_size_recursive(PARQUET_10M_PATH)

print(f"CSV Size (GB)     : {format_gb(csv_size_bytes)} GB")
print(f"Parquet Size (GB) : {format_gb(parquet_size_bytes)} GB")

# Both sizes are used as a denominator below, so both must be non-zero;
# otherwise an empty CSV would raise ZeroDivisionError in the reduction math.
if parquet_size_bytes > 0 and csv_size_bytes > 0:
    compression_ratio = round(csv_size_bytes / parquet_size_bytes, 2)
    storage_reduction_pct = round(
        (1 - parquet_size_bytes / csv_size_bytes) * 100, 2
    )

    print(f"Compression Ratio : {compression_ratio}x smaller")
    print(f"Storage Reduction : {storage_reduction_pct}%")
elif parquet_size_bytes <= 0:
    print("⚠ Parquet size returned 0. Check path.")
else:
    print("⚠ CSV size returned 0. Check path.")

# ─────────────────────────────────────────────────────────────
# STEP 2: COLUMN PRUNING IO REDUCTION
# ─────────────────────────────────────────────────────────────

section_rule = "=" * 60
print("\n" + section_rule)
print("STEP 2: Column Pruning IO Reduction Estimate")
print(section_rule)

# Share of the columns that the example query reads. The estimate scales the
# Parquet size by this share, i.e. it treats every column as equally large.
column_fraction = SELECTED_COLUMNS / TOTAL_COLUMNS
estimated_scan_bytes_column = parquet_size_bytes * column_fraction

column_scan_pct = round(column_fraction * 100, 2)
column_reduction_pct = round((1 - column_fraction) * 100, 2)

print(f"Total Columns            : {TOTAL_COLUMNS}")
print(f"Selected Columns         : {SELECTED_COLUMNS}")
print(f"Column Scan Fraction     : {column_scan_pct}%")
print(f"Estimated Scan Size (GB) : {format_gb(estimated_scan_bytes_column)} GB")
print(f"IO Reduction via Columns : {column_reduction_pct}%")

# ─────────────────────────────────────────────────────────────
# STEP 3: PARTITION PRUNING IO REDUCTION
# ─────────────────────────────────────────────────────────────

print("\n" + "=" * 60)
print("STEP 3: Partition Pruning IO Reduction Estimate")
print("=" * 60)

df_pp_10M = spark.read.parquet(PARQUET_10M_PATH)

# One distinct over (year, month) answers all three questions in a single
# Spark job: how many years, how many months, and how many year/month
# combinations actually exist. The result is tiny (one row per combination),
# so collecting it to the driver is cheap.
# NOTE(review): "year" and "month" are presumably the partition columns of
# the Parquet dataset - confirm against the writer.
year_month_pairs = df_pp_10M.select("year", "month").distinct().collect()

distinct_years = len({pair["year"] for pair in year_month_pairs})
distinct_months = len({pair["month"] for pair in year_month_pairs})
distinct_year_months = len(year_month_pairs)

if distinct_year_months == 0:
    # Without any rows the fractions below would divide by zero.
    raise ValueError(
        f"No rows found in {PARQUET_10M_PATH}; cannot estimate partition pruning."
    )

print(f"Distinct Years  : {distinct_years}")
print(f"Distinct Months : {distinct_months}")
print(f"Distinct Year/Month Combinations : {distinct_year_months}")

# Year-only filter: assumes the data is spread evenly across years, so one
# year is 1/N of the dataset.
year_fraction = 1 / distinct_years
year_reduction_pct = round((1 - year_fraction) * 100, 2)

estimated_scan_year_bytes = parquet_size_bytes * year_fraction

print("\nIf filtering on 1 year:")
print(f"Partition Scan Fraction   : {round(year_fraction * 100, 2)}%")
print(f"Estimated Scan Size (GB)  : {format_gb(estimated_scan_year_bytes)} GB")
print(f"IO Reduction via Year     : {year_reduction_pct}%")

# Year + Month filter: divide by the combinations that really exist rather
# than years * months, which overstates the partition count (and therefore
# the savings) whenever some year does not contain every month.
combined_partition_fraction = 1 / distinct_year_months
combined_partition_reduction = round(
    (1 - combined_partition_fraction) * 100, 2
)

estimated_scan_partition_bytes = parquet_size_bytes * combined_partition_fraction

print("\nIf filtering on 1 year + 1 month:")
print(f"Partition Scan Fraction   : {round(combined_partition_fraction * 100, 4)}%")
print(f"Estimated Scan Size (GB)  : {format_gb(estimated_scan_partition_bytes)} GB")
print(f"IO Reduction via Partition: {combined_partition_reduction}%")

# ─────────────────────────────────────────────────────────────
# STEP 4: COMBINED COLUMN + PARTITION REDUCTION
# ─────────────────────────────────────────────────────────────

section_rule = "=" * 60
print("\n" + section_rule)
print("STEP 4: Combined Column + Partition Reduction")
print(section_rule)

# Column pruning combined with the year-only partition filter; the
# year + month fraction is not part of this estimate.
combined_fraction = column_fraction * year_fraction
estimated_combined_bytes = parquet_size_bytes * combined_fraction

combined_scan_pct = round(combined_fraction * 100, 4)
combined_reduction_pct = round((1 - combined_fraction) * 100, 2)

print(f"Combined Scan Fraction      : {combined_scan_pct}%")
print(f"Estimated Scan Size (GB)    : {format_gb(estimated_combined_bytes)} GB")
print(f"Total Estimated IO Reduction: {combined_reduction_pct}%")

# ─────────────────────────────────────────────────────────────
# STEP 5: CLOUD COST IMPACT ESTIMATE
# ─────────────────────────────────────────────────────────────

section_rule = "=" * 60
print("\n" + section_rule)
print("STEP 5: Estimated Cloud Cost Impact")
print(section_rule)

# Price the three scan sizes with the same helper: the full CSV, the full
# Parquet dataset, and the column + partition pruned scan.
csv_cost, parquet_cost, optimized_cost = (
    estimate_cost(scanned_bytes)
    for scanned_bytes in (
        csv_size_bytes,
        parquet_size_bytes,
        estimated_combined_bytes,
    )
)

print(f"Estimated Cost (CSV full scan)        : ${csv_cost}")
print(f"Estimated Cost (Parquet full scan)    : ${parquet_cost}")
print(f"Estimated Cost (Optimized query scan) : ${optimized_cost}")

# Savings are taken from the already-rounded costs so the figure matches the
# numbers printed above.
savings_vs_csv = round(csv_cost - optimized_cost, 4)
print(f"\nEstimated Cost Savings vs CSV: ${savings_vs_csv}")

print("\n✅ Cost & IO Analysis Complete!")

STEP 1: Storage Size Comparison
CSV Size (GB)     : 3.79 GB
Parquet Size (GB) : 1.25 GB
Compression Ratio : 3.02x smaller
Storage Reduction : 66.9%

STEP 2: Column Pruning IO Reduction Estimate
Total Columns            : 55
Selected Columns         : 2
Column Scan Fraction     : 3.64%
Estimated Scan Size (GB) : 0.05 GB
IO Reduction via Columns : 96.36%

STEP 3: Partition Pruning IO Reduction Estimate
Distinct Years  : 4
Distinct Months : 12

If filtering on 1 year:
Partition Scan Fraction   : 25.0%
Estimated Scan Size (GB)  : 0.31 GB
IO Reduction via Year     : 75.0%

If filtering on 1 year + 1 month:
Partition Scan Fraction   : 2.0833%
Estimated Scan Size (GB)  : 0.03 GB
IO Reduction via Partition: 97.92%

STEP 4: Combined Column + Partition Reduction
Combined Scan Fraction      : 0.9091%
Estimated Scan Size (GB)    : 0.01 GB
Total Estimated IO Reduction: 99.09%

STEP 5: Estimated Cloud Cost Impact
Estimated Cost (CSV full scan)        : $0.0185
Estimated Cost (Parquet full scan)    :