In [None]:
## Example: preview a Parquet result file with pandas.
# NOTE(review): the Spark cells below write to ../data/processed/orders_enriched/,
# not to this path — confirm this file exists before running on a fresh kernel.

import pandas as pd

RESULT_PARQUET = '../output/result.parquet'

df = pd.read_parquet(path=RESULT_PARQUET)
df.head(n=5)  # n=5 is the pandas default, spelled out for the reader


In [None]:
# 1️⃣ PySpark Session Setup
# Build the Spark session (or reuse one that already exists) under the demo's
# application name; the data-loading cell reads through `spark`.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("Ecommerce ETL Demo")
    .getOrCreate()
)


In [None]:
# 2️⃣ Load Raw Data
# All three raw CSVs are read with the same reader settings: the first row is
# treated as the header and Spark infers each column's type from the data.
csv_read_options = {"header": True, "inferSchema": True}

raw_orders = spark.read.csv("../data/raw/orders.csv", **csv_read_options)
raw_customers = spark.read.csv("../data/raw/customers.csv", **csv_read_options)
raw_products = spark.read.csv("../data/raw/products.csv", **csv_read_options)



In [None]:
# 3️⃣ Basic Data Checks
# Print the schema of the raw orders, then preview their first five rows.
raw_orders.printSchema()
raw_orders.show(n=5)


In [None]:
# 4️⃣ Transformations (example: filter valid orders & join with products)
from pyspark.sql.functions import col

# Keep only the orders whose order_status equals "Completed".
is_completed = col("order_status") == "Completed"
valid_orders = raw_orders.where(is_completed)

# Left join on product_id: every valid order is kept, and product columns are
# null for orders that have no matching row in raw_products.
orders_with_products = valid_orders.join(raw_products, "product_id", "left")


In [None]:
# 5️⃣ Save Processed Output (overwrite mode for notebooks)
# "overwrite" replaces whatever already exists at the target path, so this
# cell can be re-run without first deleting the previous output.
orders_with_products.write.parquet(
    "../data/processed/orders_enriched/",
    mode="overwrite",
)


In [None]:
# 6️⃣ Simple Aggregation Summary
# Number of enriched order rows per product_category, largest group first.
summary = (
    orders_with_products
    .groupBy("product_category")
    .count()
    .orderBy("count", ascending=False)
)
summary.show()
