In [0]:
from pyspark.sql import SparkSession
# Obtain the Spark session for this notebook (an already-running one is reused).
spark = (
    SparkSession.builder
    .appName("Customer Order Insights")
    .getOrCreate()
)


 Load cleaned order data into Databricks

In [0]:
# Orders: comma-separated file with a header row; column types are inferred.
orders_df = spark.read.csv(
    "file:///Workspace/Shared/orders.csv",
    header=True,
    sep=",",
    inferSchema=True,
)


In [0]:
# Customers: tab-separated (despite the .csv extension) with a header row;
# column types are inferred.
customers_df = spark.read.csv(
    "file:///Workspace/Shared/customers.csv",
    header=True,
    sep="\t",
    inferSchema=True,
)


In [0]:
# Delivery statuses: comma-separated file with a header row; column types are inferred.
delivery_df = spark.read.csv(
    "file:///Workspace/Shared/delivery_status.csv",
    header=True,
    sep=",",
    inferSchema=True,
)


In [0]:
from pyspark.sql.functions import trim, col

def _trim_names_and_values(df):
    """Return `df` with whitespace trimmed from every value and every column name.

    Each column is passed through `trim()`, which operates on strings, so the
    columns of the result are string-typed; later cells cast them as needed.
    """
    trimmed_columns = []
    for raw_name in df.columns:
        trimmed_columns.append(trim(col(raw_name)).alias(raw_name.strip()))
    return df.select(trimmed_columns)


# Apply the same clean-up to all three tables.
orders_df = _trim_names_and_values(orders_df)
customers_df = _trim_names_and_values(customers_df)
delivery_df = _trim_names_and_values(delivery_df)


In [0]:
from pyspark.sql.functions import to_date

# Orders: both keys become integers and both date columns are parsed
# as day-month-year ("dd-MM-yyyy").
# NOTE(review): the output recorded in this notebook shows every orders row as
# NULL after these conversions — confirm the source values really are integer
# ids and dd-MM-yyyy dates before relying on the joins below.
orders_df = (
    orders_df
    .withColumn("order_id", col("order_id").cast("int"))
    .withColumn("customer_id", col("customer_id").cast("int"))
    .withColumn("order_date", to_date(col("order_date"), "dd-MM-yyyy"))
    .withColumn("delivery_date", to_date(col("delivery_date"), "dd-MM-yyyy"))
)

# Customers: the join key becomes an integer so it matches orders_df.customer_id.
customers_df = customers_df.withColumn(
    "customer_id",
    col("customer_id").cast("int"),
)

# Delivery: order_id (join key) and status_id (used for ordering) become integers.
delivery_df = (
    delivery_df
    .withColumn("order_id", col("order_id").cast("int"))
    .withColumn("status_id", col("status_id").cast("int"))
)


In [0]:
frames_to_inspect = (orders_df, customers_df, delivery_df)

# Print all three schemas first ...
for frame in frames_to_inspect:
    frame.printSchema()

# ... then a five-row preview of each table, in the same order.
for frame in frames_to_inspect:
    frame.show(5)


root
 |-- order_id: integer (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- order_date: date (nullable = true)
 |-- delivery_date: date (nullable = true)

root
 |-- customer_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- phone: string (nullable = true)

root
 |-- status_id: integer (nullable = true)
 |-- order_id: integer (nullable = true)
 |-- status: string (nullable = true)
 |-- issue: string (nullable = true)

+--------+-----------+----------+-------------+
|order_id|customer_id|order_date|delivery_date|
+--------+-----------+----------+-------------+
|    NULL|       NULL|      NULL|         NULL|
|    NULL|       NULL|      NULL|         NULL|
|    NULL|       NULL|      NULL|         NULL|
|    NULL|       NULL|      NULL|         NULL|
|    NULL|       NULL|      NULL|         NULL|
+--------+-----------+----------+-------------+
only showing top 5 rows

+-----------+----------------+---------------

In [0]:
# Attach customer details to each order; orders without a matching customer are dropped.
orders_customers_df = orders_df.join(customers_df, "customer_id", "inner")

# Attach every delivery-status row to its order; orders with no status rows are kept.
full_df = orders_customers_df.join(delivery_df, "order_id", "left")


 Create a pipeline that keeps only the latest delivery status per order

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, desc

# Within each order_id, number the status rows from the highest status_id down
# and keep only row 1.
# NOTE(review): assumes a higher status_id means a more recent status — confirm.
window_spec = Window.partitionBy("order_id").orderBy(desc("status_id"))
latest_status_df = (
    delivery_df
    .withColumn("rank", row_number().over(window_spec))
    .where("rank = 1")
    .drop("rank")
)

# Attach just the single kept status row to each order/customer row;
# orders that have no status row are retained.
final_df = orders_customers_df.join(latest_status_df, "order_id", "left")



Save the results as Delta or CSV

In [0]:
# Persist the result in Delta format, replacing whatever is already at the path.
final_df.write.save(
    "/delta/orders_with_status",
    format="delta",
    mode="overwrite",
)


In [0]:
# Also export the result as CSV with a header row, replacing any previous export.
final_df.write.csv(
    "dbfs:/FileStore/output/orders_with_status.csv",
    mode="overwrite",
    header=True,
)


 Run a SQL query to show the top 5 customers with the most delayed orders

In [0]:
# Expose final_df to SQL cells under a session-scoped view name.
orders_status_view_name = "orders_status_view"
final_df.createOrReplaceTempView(orders_status_view_name)

In [0]:
%sql
-- Top 5 customers by number of orders whose status in orders_status_view is 'Delayed'.
-- customer_id is a secondary sort key so that customers tied on delayed_orders
-- come back in a stable order and the LIMIT always selects the same rows.
SELECT
  customer_id,
  name,
  COUNT(*) AS delayed_orders
FROM orders_status_view
WHERE status = 'Delayed'
GROUP BY customer_id, name
ORDER BY delayed_orders DESC, customer_id ASC
LIMIT 5;
