In [0]:
# ============================================================
# 05_Price_Optimization_Analysis_With_Results_And_Timing
# ============================================================

from pyspark.sql.functions import (
    col, avg, sum as _sum, count,
    round as _round, when
)
import time

# Parquet location of the sales dataset analysed by this notebook.
PARQUET_100M_PATH = "/Volumes/workspace/default/raw_data/ecommerce_100M_parquet"

# Report banner: rule, title, rule.
print(
    "=" * 70,
    "PRICE OPTIMIZATION ANALYSIS — 100M SALES RECORDS",
    "=" * 70,
    sep="\n",
)

# Wall-clock reference for the whole run; the final timing summary reads it.
global_start = time.time()

# ------------------------------------------------------------
# LOAD DATA
# ------------------------------------------------------------
start = time.time()
df = spark.read.parquet(PARQUET_100M_PATH)
# Run one action (a row count) so the timing below covers an executed job
# rather than only the definition of the read. The count itself is discarded.
# NOTE(review): `df` is not cached here, so each later section presumably
# reads from the Parquet path again — confirm that is intended.
df.count()
load_time = round(time.time() - start, 2)

print(
    "\nDataset Loaded",
    f"Load Time: {load_time} seconds",
    sep="\n",
)

# ============================================================
# 1️⃣ DISCOUNT IMPACT ANALYSIS
# ============================================================

# Section banner: blank line, rule, title, rule.
print(
    "\n" + "=" * 70,
    "1️⃣ DISCOUNT IMPACT ANALYSIS",
    "=" * 70,
    sep="\n",
)

start = time.time()

# One row per category: mean discount, summed and mean final price, and the
# number of rows, sorted so the highest-revenue category comes first.
# NOTE(review): total_revenue is a rounded double, which show() renders in
# E-notation for values this large — consider a decimal cast if the table is
# meant to be read by humans.
discount_analysis = (
    df.groupBy("category")
    .agg(
        _round(avg("discount"), 2).alias("avg_discount"),
        _round(_sum("final_price"), 2).alias("total_revenue"),
        _round(avg("final_price"), 2).alias("avg_final_price"),
        count("*").alias("transactions"),
    )
    .orderBy(col("total_revenue").desc())
)

# show() is the action that executes the aggregation, so it sits inside the
# timed region.
discount_analysis.show(n=10, truncate=False)
t1 = round(time.time() - start, 2)

print(f"Execution Time: {t1} seconds")

# ============================================================
# 2️⃣ ELASTICITY ANALYSIS
# ============================================================

# Section banner: blank line, rule, title, rule.
print(
    "\n" + "=" * 70,
    "2️⃣ ELASTICITY ANALYSIS (DISCOUNT VS QUANTITY)",
    "=" * 70,
    sep="\n",
)

start = time.time()

# One row per category: mean discount next to mean quantity and summed final
# price, sorted by mean discount (largest first). This only places the
# averages side by side; no correlation or slope is computed here.
elasticity = (
    df.groupBy("category")
    .agg(
        _round(avg("discount"), 2).alias("avg_discount"),
        _round(avg("quantity"), 2).alias("avg_quantity"),
        _round(_sum("final_price"), 2).alias("total_revenue"),
    )
    .orderBy(col("avg_discount").desc())
)

# show() is the action that executes the aggregation, so it sits inside the
# timed region.
elasticity.show(n=10, truncate=False)
t2 = round(time.time() - start, 2)

print(f"Execution Time: {t2} seconds")

# ============================================================
# 3️⃣ CITY-LEVEL ANALYSIS
# ============================================================

# Section banner: blank line, rule, title, rule.
print(
    "\n" + "=" * 70,
    "3️⃣ CITY-LEVEL REVENUE ANALYSIS",
    "=" * 70,
    sep="\n",
)

start = time.time()

# One row per city: summed final price, mean discount and row count, sorted
# so the highest-revenue city comes first. Only the first 10 rows are shown.
city_analysis = (
    df.groupBy("city")
    .agg(
        _round(_sum("final_price"), 2).alias("total_revenue"),
        _round(avg("discount"), 2).alias("avg_discount"),
        count("*").alias("transactions"),
    )
    .orderBy(col("total_revenue").desc())
)

# show() is the action that executes the aggregation, so it sits inside the
# timed region.
city_analysis.show(n=10, truncate=False)
t3 = round(time.time() - start, 2)

print(f"Execution Time: {t3} seconds")

# ============================================================
# 4️⃣ PRIME SEGMENTATION
# ============================================================

# Section banner: blank line, rule, title, rule.
print(
    "\n" + "=" * 70,
    "4️⃣ PRIME VS NON-PRIME ANALYSIS",
    "=" * 70,
    sep="\n",
)

start = time.time()

# One row per distinct is_prime_user value: summed final price, mean
# discount, mean quantity and row count. No ordering is applied, so the row
# order of the printed table is whatever Spark returns.
prime_analysis = (
    df.groupBy("is_prime_user")
    .agg(
        _round(_sum("final_price"), 2).alias("total_revenue"),
        _round(avg("discount"), 2).alias("avg_discount"),
        _round(avg("quantity"), 2).alias("avg_quantity"),
        count("*").alias("transactions"),
    )
)

# show() is the action that executes the aggregation, so it sits inside the
# timed region.
prime_analysis.show(n=10, truncate=False)
t4 = round(time.time() - start, 2)

print(f"Execution Time: {t4} seconds")

# ============================================================
# 5️⃣ UNDERPRICED PRODUCT DETECTION
# ============================================================

# Section banner: blank line, rule, title, rule.
print(
    "\n" + "=" * 70,
    "5️⃣ UNDERPRICED PRODUCT DETECTION",
    "=" * 70,
    sep="\n",
)

start = time.time()

# One row per product_id: summed quantity and summed final price, sorted by
# summed quantity (largest first). The table shows the 10 highest-volume
# products; no per-unit price is derived in this step.
product_analysis = (
    df.groupBy("product_id")
    .agg(
        _round(_sum("quantity"), 2).alias("total_quantity"),
        _round(_sum("final_price"), 2).alias("total_revenue"),
    )
    .orderBy(col("total_quantity").desc())
)

# show() is the action that executes the aggregation, so it sits inside the
# timed region.
product_analysis.show(n=10, truncate=False)
t5 = round(time.time() - start, 2)

print(f"Execution Time: {t5} seconds")

# ============================================================
# 6️⃣ PROFIT LEAKAGE ANALYSIS
# ============================================================

# Section banner: blank line, rule, title, rule.
print(
    "\n" + "=" * 70,
    "6️⃣ PROFIT LEAKAGE ANALYSIS",
    "=" * 70,
    sep="\n",
)

start = time.time()

# Per-row discount as a fraction of total_price. The division is only taken
# when total_price > 0; every row that fails that check gets a ratio of 0.
# NOTE(review): those zero-filled rows are included in the category average
# below — confirm that is intended rather than leaving them out.
df_ratio = df.withColumn(
    "discount_ratio",
    when(
        col("total_price") > 0,
        col("discount") / col("total_price"),
    ).otherwise(0),
)

# One row per category: mean discount ratio (3 decimals) and mean discount
# (2 decimals), sorted by the mean ratio, largest first.
leakage_analysis = (
    df_ratio.groupBy("category")
    .agg(
        _round(avg("discount_ratio"), 3).alias("avg_discount_ratio"),
        _round(avg("discount"), 2).alias("avg_discount"),
    )
    .orderBy(col("avg_discount_ratio").desc())
)

# show() is the action that executes the aggregation, so it sits inside the
# timed region.
leakage_analysis.show(n=10, truncate=False)
t6 = round(time.time() - start, 2)

print(f"Execution Time: {t6} seconds")

# ============================================================
# FINAL TIMING SUMMARY
# ============================================================

# Elapsed wall-clock time since global_start was taken, rounded to 2 decimals.
total_time = round(time.time() - global_start, 2)

# Summary banner: blank line, rule, title, rule.
print(
    "\n" + "=" * 70,
    "ANALYSIS TIMING SUMMARY",
    "=" * 70,
    sep="\n",
)

# One line per measured stage: label left-aligned in 35 columns, seconds
# right-aligned in 10 columns.
print(
    "\n".join(
        f"{label:<35} {seconds:>10} sec"
        for label, seconds in (
            ("Load", load_time),
            ("Discount Impact", t1),
            ("Elasticity", t2),
            ("City Analysis", t3),
            ("Prime Segmentation", t4),
            ("Underpricing", t5),
            ("Profit Leakage", t6),
        )
    )
)

# Separator, grand total, closing rule, then the completion message.
print(
    "-" * 55,
    f"{'TOTAL ANALYSIS TIME':<35} {total_time:>10} sec",
    "=" * 70,
    "\n✅ Price Optimization Analysis Complete!",
    sep="\n",
)

PRICE OPTIMIZATION ANALYSIS — 100M SALES RECORDS

Dataset Loaded
Load Time: 11.4 seconds

1️⃣ DISCOUNT IMPACT ANALYSIS
+-----------+------------+-----------------+---------------+------------+
|category   |avg_discount|total_revenue    |avg_final_price|transactions|
+-----------+------------+-----------------+---------------+------------+
|Beauty     |1146.95     |1.378664610197E11|6884.56        |20025460    |
|Appliances |1147.86     |1.3777045363E11  |6884.08        |20012920    |
|Grocery    |1148.94     |1.376898883477E11|6886.11        |19995310    |
|Fashion    |1148.68     |1.376540048975E11|6888.55        |19983020    |
|Electronics|1147.6      |1.376042728981E11|6885.97        |19983290    |
+-----------+------------+-----------------+---------------+------------+

Execution Time: 5.81 seconds

2️⃣ ELASTICITY ANALYSIS (DISCOUNT VS QUANTITY)
+-----------+------------+------------+-----------------+
|category   |avg_discount|avg_quantity|total_revenue    |
+-----------+--------