In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.utils import AnalysisException




### Spark Session

In [0]:
# Reuse the already-active Spark session when there is one; otherwise start a
# new session registered under the application name "RetailOrders".
session_builder = SparkSession.builder.appName("RetailOrders")
spark = session_builder.getOrCreate()

### Loading dataset

In [0]:
# Single source of truth for the raw dataset directory, so every load in this
# cell is guaranteed to point at the same location.
DATASET_DIR = "/FileStore/retail-data-pipeline/dataset"

# Load Customers (delivered as two separate drops)
customers_first = spark.read.parquet(f"{DATASET_DIR}/customers_first.parquet")
customers_second = spark.read.parquet(f"{DATASET_DIR}/customers_second.parquet")

# Load Orders (delivered as two separate drops)
orders_first = spark.read.parquet(f"{DATASET_DIR}/orders_first.parquet")
orders_second = spark.read.parquet(f"{DATASET_DIR}/orders_second.parquet")

# Load Products (delivered as two separate drops)
products_first = spark.read.parquet(f"{DATASET_DIR}/products_first.parquet")
products_second = spark.read.parquet(f"{DATASET_DIR}/products_second.parquet")

# Load Regions (single file)
regions = spark.read.parquet(f"{DATASET_DIR}/regions.parquet")

### Exception Handling for Dataset Loading


In [0]:
def safe_read_parquet(path):
    """Read a parquet dataset, returning ``None`` instead of raising on failure.

    Args:
        path: Location of the parquet file or directory to read.

    Returns:
        The loaded DataFrame, or ``None`` when Spark raises an
        ``AnalysisException`` while resolving or reading ``path``.
    """
    try:
        return spark.read.parquet(path)
    except AnalysisException as exc:
        # AnalysisException is not only raised for a missing path (Spark also
        # uses it when, for example, the schema cannot be inferred), so report
        # Spark's own message rather than always claiming "file not found".
        print(f"Could not read parquet at {path}: {exc}")
        return None

# Re-load every dataset through safe_read_parquet so an unreadable file yields
# None (plus a printed message) instead of aborting the cell.
# All seven files are read from the same dataset directory.
customers_first = safe_read_parquet("/FileStore/retail-data-pipeline/dataset/customers_first.parquet")
customers_second = safe_read_parquet("/FileStore/retail-data-pipeline/dataset/customers_second.parquet")
orders_first = safe_read_parquet("/FileStore/retail-data-pipeline/dataset/orders_first.parquet")
orders_second = safe_read_parquet("/FileStore/retail-data-pipeline/dataset/orders_second.parquet")
products_first = safe_read_parquet("/FileStore/retail-data-pipeline/dataset/products_first.parquet")
products_second = safe_read_parquet("/FileStore/retail-data-pipeline/dataset/products_second.parquet")
regions = safe_read_parquet("/FileStore/retail-data-pipeline/dataset/regions.parquet")

### Displaying Data

In [0]:
# Preview one DataFrame per source dataset, in the same order as the loads:
# customers, orders, products, regions.
for preview_df in (customers_first, orders_first, products_first, regions):
    display(preview_df)

### Lookup Layer

In [0]:

# safe_read_parquet returns None for a dataset it could not read. Calling
# unionByName / write on None fails with an opaque AttributeError, so check
# up front and name the datasets that are actually missing.
_lookup_inputs = {
    "customers_first": customers_first,
    "customers_second": customers_second,
    "orders_first": orders_first,
    "orders_second": orders_second,
    "products_first": products_first,
    "products_second": products_second,
    "regions": regions,
}
_missing_inputs = [name for name, frame in _lookup_inputs.items() if frame is None]
if _missing_inputs:
    raise ValueError(
        "Cannot build the lookup layer; these datasets failed to load: "
        + ", ".join(_missing_inputs)
    )

# Stack the two drops of each entity; unionByName matches columns by name
# rather than by position.
lookup_customers = customers_first.unionByName(customers_second)
lookup_orders = orders_first.unionByName(orders_second)
lookup_products = products_first.unionByName(products_second)
lookup_regions = regions

# Persist each lookup DataFrame as a managed table, replacing any previous
# version so the cell can be re-run.
_lookup_tables = {
    "lookup_customers": lookup_customers,
    "lookup_orders": lookup_orders,
    "lookup_products": lookup_products,
    "lookup_regions": lookup_regions,
}
for _table_name, _lookup_df in _lookup_tables.items():
    _lookup_df.write.mode("overwrite").saveAsTable(_table_name)


### Bronze Layer

In [0]:

# Raw landing directory: the same location the batch loads in this notebook
# read from.
BRONZE_SOURCE_DIR = "/FileStore/retail-data-pipeline/dataset"

# Bronze table name -> glob of the raw parquet drops that feed it. The silver
# layer queries all four of these tables, so each one has to be ingested here.
# NOTE(review): the globs pick up both the *_first and *_second drops,
# mirroring the lookup-layer unions -- confirm that is the intended scope.
BRONZE_SOURCES = {
    "bronze_customers": f"{BRONZE_SOURCE_DIR}/customers_*.parquet",
    "bronze_orders": f"{BRONZE_SOURCE_DIR}/orders_*.parquet",
    "bronze_products": f"{BRONZE_SOURCE_DIR}/products_*.parquet",
    "bronze_regions": f"{BRONZE_SOURCE_DIR}/regions.parquet",
}


def ingest_bronze(table_name, source_path):
    """Incrementally ingest raw parquet files into a bronze Delta table.

    Runs an Auto Loader stream over ``source_path``, appends everything that
    is currently available to ``table_name`` and blocks until that batch has
    been written, so later cells can query the table immediately.

    Args:
        table_name: Name of the bronze Delta table to append to. Also used to
            derive the checkpoint location ``/tmp/<table_name>_checkpoint``.
        source_path: Path or glob of the raw parquet files to ingest.

    Returns:
        The streaming DataFrame that was used as the source of the ingest.
    """
    checkpoint_path = f"/tmp/{table_name}_checkpoint"

    raw_stream = (
        spark.readStream
        .format("cloudFiles")
        .option("cloudFiles.format", "parquet")
        # Auto Loader needs either an explicit schema or a schema location.
        # Parquet files carry their schema, so take it from a batch read.
        .schema(spark.read.parquet(source_path).schema)
        .load(source_path)
    )

    query = (
        raw_stream.writeStream
        .format("delta")
        .option("checkpointLocation", checkpoint_path)
        .outputMode("append")
        # Process what is available now and then stop, instead of leaving a
        # continuously running stream behind in the notebook.
        .trigger(availableNow=True)
        .toTable(table_name)
    )
    # The query runs asynchronously; wait for it so the table is populated
    # before the silver layer reads it.
    query.awaitTermination()
    return raw_stream


bronze_streams = {
    table_name: ingest_bronze(table_name, source_path)
    for table_name, source_path in BRONZE_SOURCES.items()
}
# Keep the original notebook-level name bound to the customers source stream.
bronze_customers = bronze_streams["bronze_customers"]


### Silver Layer

In [0]:

# Customers: drop exact duplicate rows and rows without a key.
silver_customers = spark.sql("""
  SELECT DISTINCT * FROM bronze_customers WHERE customer_id IS NOT NULL
""")

# Orders enriched with customer and region names.
# The bronze dimension tables are append-only and may hold repeated rows (the
# customers query above de-duplicates for that reason). Joining orders against
# them directly would repeat each order once per duplicate, so join against
# de-duplicated (key, name) projections instead.
silver_orders = spark.sql("""
  SELECT o.*, c.customer_name, r.region_name
  FROM bronze_orders o
  JOIN (
    SELECT DISTINCT customer_id, customer_name FROM bronze_customers
  ) c ON o.customer_id = c.customer_id
  JOIN (
    SELECT DISTINCT region_id, region_name FROM bronze_regions
  ) r ON o.region_id = r.region_id
""")

# Products: drop exact duplicate rows and rows without a key.
silver_products = spark.sql("""
  SELECT DISTINCT * FROM bronze_products WHERE product_id IS NOT NULL
""")

# Persist the silver tables, replacing any previous version so the cell can be
# re-run.
silver_customers.write.mode("overwrite").saveAsTable("silver_customers")
silver_orders.write.mode("overwrite").saveAsTable("silver_orders")
silver_products.write.mode("overwrite").saveAsTable("silver_products")

### Gold Layer

In [0]:

# Sales totals per product category and region.
# silver_orders already carries region_name and only contains orders that
# matched a customer, so re-joining the customer and region tables adds no
# information and can only multiply rows (inflating SUM(order_amount)) when a
# key appears more than once. Read region_name from the order row instead,
# which also keeps the gold layer independent of bronze tables.
# Products are joined through a de-duplicated (product_id, product_category)
# projection for the same reason.
gold_sales_summary = spark.sql("""
  SELECT
    p.product_category,
    o.region_name,
    SUM(o.order_amount) AS total_sales,
    COUNT(DISTINCT o.order_id) AS total_orders
  FROM silver_orders o
  JOIN (
    SELECT DISTINCT product_id, product_category FROM silver_products
  ) p ON o.product_id = p.product_id
  GROUP BY p.product_category, o.region_name
""")

# Persist the aggregate, replacing any previous version so the cell can be
# re-run.
gold_sales_summary.write.mode("overwrite").saveAsTable("gold_sales_summary")


### Displaying Gold Layer Results

In [0]:
# Read the persisted gold table back from the metastore (rather than reusing
# the in-memory DataFrame) and render it.
gold_sales_summary_table = spark.table("gold_sales_summary")
display(gold_sales_summary_table)