# In [0]:
from pyspark.sql.functions import *

# STEP 1: Load Data
# Both CSV files are read with their first row treated as the header.
# No schema or inferSchema option is set here, so the numeric and date
# columns are cast explicitly in the next step.
sales_df = spark.read.csv("dbfs:/FileStore/sales.csv", header=True)
products_df = spark.read.csv("dbfs:/FileStore/products.csv", header=True)

# STEP 2: Cast numeric columns to correct types
# price is cast to double, not int: an int cast discards the fractional part
# of values such as "19.99" (or fails outright under ANSI mode), which would
# understate total_amount and every revenue figure derived from it. The later
# round(..., 2) calls only make sense if cents are preserved.
# NOTE(review): this changes the price/total_amount column types in the Delta
# table written in STEP 9; if that table already exists with integer columns,
# the overwrite may need the overwriteSchema option -- confirm before deploying.
sales_df = (
    sales_df
    .withColumn("quantity", col("quantity").cast("int"))
    .withColumn("price", col("price").cast("double"))
    # to_date with no explicit format relies on Spark's default date parsing.
    .withColumn("order_date", to_date("order_date"))
)

# STEP 3: Calculate total amount
# STEP 4: Extract month name
# total_amount is the per-row product of quantity and price.
# month holds only the month name ("MMMM" pattern) with no year component,
# so rows from the same calendar month of different years share one label.
sales_df = (
    sales_df
    .withColumn("total_amount", col("quantity") * col("price"))
    .withColumn("month", date_format("order_date", "MMMM"))
)

# STEP 5: Join with products
# Left join on product_id: every sales row is kept, and sales whose product_id
# has no match in products_df get nulls in the product columns.
df = sales_df.join(products_df, "product_id", "left")

# STEP 6: Revenue by Month
# round and sum here are the pyspark.sql.functions versions brought in by the
# star import at the top of the file, not the Python builtins.
revenue_by_month = (
    df.groupBy("month")
    .agg(round(sum(col("total_amount")), 2).alias("monthly_revenue"))
)

# STEP 7: Revenue by Category
# Same aggregation as above, grouped by the category column instead of month.
revenue_by_category = (
    df.groupBy("category")
    .agg(round(sum(col("total_amount")), 2).alias("category_revenue"))
)

# STEP 8: Top 2 Products per Region
# Total sales are first summed per (product_name, region) pair.
product_sales = df.groupBy("product_name", "region").agg(sum("total_amount").alias("total_sales"))

from pyspark.sql.window import Window

# Rank products within each region by total_sales, highest first.
# product_name is a secondary sort key: row_number() assigns distinct ranks
# even to tied rows, so without a tie-breaker the products picked among equal
# total_sales values could differ from run to run.
windowSpec = Window.partitionBy("region").orderBy(
    desc("total_sales"),
    col("product_name").asc(),
)

# Keep only the two highest-ranked rows per region.
top_products = (
    product_sales
    .withColumn("rank", row_number().over(windowSpec))
    .filter(col("rank") <= 2)
)

# STEP 9: Save Cleaned Data as Delta Table
# Writes the joined sales/products DataFrame (df), not the aggregates above.
# Overwrite mode replaces whatever is already stored at the target path.
(
    df.write
    .mode("overwrite")
    .format("delta")
    .save("dbfs:/mnt/data/sales_analytics_cleaned")
)
