In [None]:
# Databricks notebook source
# Import libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as _sum, to_date

# ---------------------------------------------------------------------------
# Retail sales transformation
#   1. load the raw sales and product-master CSV files
#   2. enrich each sale with its product attributes
#   3. total sales_amount per product category
#   4. display the result and persist it as Parquet
# ---------------------------------------------------------------------------

# getOrCreate() hands back the already-running session when there is one and
# only builds a new session (under this app name) otherwise.
spark = (
    SparkSession.builder
    .appName("RetailSalesTransformation")
    .getOrCreate()
)

# Input locations -- placeholders; swap in the ADLS paths for a real setup.
sales_path = "/Downloads/retail_sales.csv"
product_path = "/Downloads/data/product_master.csv"

# Both files are read with a header row and Spark-inferred column types.
df_sales = spark.read.options(header=True, inferSchema=True).csv(sales_path)
df_products = spark.read.options(header=True, inferSchema=True).csv(product_path)

# Normalise "date" to a DateType column, parsing it as yyyy-MM-dd.
df_sales = df_sales.withColumn(
    "date",
    to_date(col("date"), "yyyy-MM-dd"),
)

# Left join on product_id: every sales row is kept, whether or not the
# product master has a matching entry.
df_joined = df_sales.join(df_products, "product_id", "left")

# One row per category with its summed sales_amount, largest total first.
df_category_sales = (
    df_joined
    .groupBy("category")
    .agg(_sum("sales_amount").alias("total_sales"))
    .orderBy("total_sales", ascending=False)
)

# Print the aggregated rows to the notebook / driver output.
df_category_sales.show()

# Persist to the processed zone, replacing any output already at this path.
output_path = "/mnt/processed/retail_category_sales"
df_category_sales.write.parquet(output_path, mode="overwrite")

print("Transformation completed successfully!")
