In [0]:
# ============================================================
# 01_Data_Load_and_Conversion
# Purpose: Load 10M CSV → Convert to Partitioned Parquet
#          → Generate 100M Partitioned Parquet
# ============================================================

from pyspark.sql.functions import col, to_date, year, month
import time

# ── Paths ────────────────────────────────────────────────────
# Source CSV that Step 1 loads.
CSV_PATH = "/Volumes/workspace/default/raw_data/ecommerce_10M_55cols.csv"
# Destination of the Step 3 write; Step 4 reads it back.
PARQUET_10M_PATH = "/Volumes/workspace/default/raw_data/ecommerce_parquet"
# Destination of the Step 6 write; Step 7 reads it back.
PARQUET_100M_PATH = "/Volumes/workspace/default/raw_data/ecommerce_100M_parquet"

# ── Step 1: Load CSV ─────────────────────────────────────────
banner = "=" * 60
print(banner)
print("STEP 1: Loading 10M CSV")
print(banner)

# Treat the first line as column names and let Spark infer column types
# from the data instead of reading every column as a string.
csv_reader = spark.read.options(header="true", inferSchema="true")
df_csv = csv_reader.csv(CSV_PATH)

print("Schema:")
df_csv.printSchema()

# count() is an action: it runs a job over the CSV to obtain the row total.
csv_row_count = df_csv.count()
print(f"Total Rows (10M CSV): {csv_row_count:,}")

# ── Step 2: Add Partition Columns ────────────────────────────
print("\nSTEP 2: Adding year/month partition columns...")

# Normalise order_date to a date first; the year/month keys are then
# derived from the normalised column, not from the raw input.
parsed_order_date = to_date(col("order_date"), "yyyy-MM-dd")
df_with_order_date = df_csv.withColumn("order_date", parsed_order_date)

df_partitioned = (
    df_with_order_date
    .withColumn("year", year(col("order_date")))
    .withColumn("month", month(col("order_date")))
)

# ── Step 3: Write 10M Partitioned Parquet ───────────────────
print("\nSTEP 3: Writing 10M Partitioned Parquet...")

# Wall-clock timer around the write; the write is the action that
# actually executes the plan built in the previous steps.
start = time.time()

# NOTE(review): repartition(100) spreads rows without regard to year/month,
# so each year/month directory may receive a file from many of the 100
# tasks. Presumably intentional for write parallelism — confirm the
# resulting file sizes are acceptable.
(
    df_partitioned
    .repartition(100)
    .write
    .parquet(
        PARQUET_10M_PATH,
        mode="overwrite",
        partitionBy=["year", "month"],
    )
)

elapsed_10m = round(time.time() - start, 2)
print(f"10M Parquet Write Time: {elapsed_10m} seconds")
print(f"Saved to: {PARQUET_10M_PATH}")

# ── Step 4: Verify 10M Parquet ───────────────────────────────
# Read the dataset back from PARQUET_10M_PATH and report its row total.
parquet_reader_10m = spark.read
df_pp_10M = parquet_reader_10m.parquet(PARQUET_10M_PATH)
parquet_10m_row_count = df_pp_10M.count()
print(f"\nVerification - 10M Parquet Row Count: {parquet_10m_row_count:,}")

# ── Step 5: Generate 100M via CrossJoin ─────────────────────
# The console label below reads "STEP 4" because the verification steps
# print no STEP label of their own.
print("\nSTEP 4: Generating 100M dataset via crossJoin replication...")

# Ten-row helper frame: pairing every row read back from the 10M Parquet
# with each helper row yields ten copies of the data. The helper column is
# only a multiplier, so it is dropped straight away.
replicas = spark.range(0, 10).toDF("replica")
replicated = df_pp_10M.crossJoin(replicas).drop("replica")
df_100M = replicated.repartition(400)

# count() is an action, so the replication plan is executed here to
# produce the total.
replicated_row_count = df_100M.count()
print(f"100M Row Count: {replicated_row_count:,}")

# ── Step 6: Write 100M Partitioned Parquet ──────────────────
# The console label below reads "STEP 5" because the verification steps
# print no STEP label of their own.
print("\nSTEP 5: Writing 100M Partitioned Parquet...")

# Wall-clock timer around the write of the replicated dataset.
start = time.time()

# Replace whatever is already at the target path and lay the files out
# under year=/month= directories.
df_100M.write.parquet(
    PARQUET_100M_PATH,
    mode="overwrite",
    partitionBy=["year", "month"],
)

elapsed_100m = round(time.time() - start, 2)
print(f"100M Parquet Write Time: {elapsed_100m} seconds")
print(f"Saved to: {PARQUET_100M_PATH}")

# ── Step 7: Final Verification ───────────────────────────────
# Read the replicated dataset back from PARQUET_100M_PATH and report its
# row total before announcing completion.
parquet_reader_100m = spark.read
df_pp_100M = parquet_reader_100m.parquet(PARQUET_100M_PATH)
parquet_100m_row_count = df_pp_100M.count()
print(f"\nVerification - 100M Parquet Row Count: {parquet_100m_row_count:,}")
print("\n✅ Data Load and Conversion Complete!")

STEP 1: Loading 10M CSV
Schema:
root
 |-- transaction_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- phone: long (nullable = true)
 |-- email: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- pincode: integer (nullable = true)
 |-- registration_date: date (nullable = true)
 |-- order_id: integer (nullable = true)
 |-- order_date: date (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_channel: string (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- product_name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- price_per_unit: double (nullable = true)
 |-- total_price: double (nullable = true)
 |-- discount: double (nullable = true)
 |-- tax: double (nu