In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Customer delay analysis: count orders whose delivery status is "Delayed"
# per customer name, save the counts as CSV, and print them.
spark = (
    SparkSession.builder
    .appName("Customer Order Insights - Customer Delay Analysis")
    .getOrCreate()
)

# try/finally guarantees the session is released even when a read, join, or
# write fails part-way through; previously stop() ran only on success.
try:
    # Load data (first row is the header; column types are inferred).
    customers_df = (
        spark.read.option("header", True).option("inferSchema", True)
        .csv("customers.csv")
    )
    orders_df = (
        spark.read.option("header", True).option("inferSchema", True)
        .csv("orders.csv")
    )
    delivery_df = (
        spark.read.option("header", True).option("inferSchema", True)
        .csv("delivery_status.csv")
    )

    # Force the join keys to int so both sides of each join agree on type
    # regardless of what schema inference picked.
    # NOTE(review): customers_df.customer_id is not cast here and relies on
    # inferSchema alone -- confirm it is inferred as an integer column.
    orders_df = (
        orders_df
        .withColumn("order_id", col("order_id").cast("int"))
        .withColumn("customer_id", col("customer_id").cast("int"))
    )
    delivery_df = delivery_df.withColumn("order_id", col("order_id").cast("int"))

    # Join datasets: orders -> delivery status -> customer details.
    # Inner joins drop orders with no delivery row or no matching customer.
    orders_with_status = orders_df.join(delivery_df, on="order_id", how="inner")
    full_data = orders_with_status.join(customers_df, on="customer_id", how="inner")

    # Filter delayed orders.
    delayed_orders = full_data.filter(col("status") == "Delayed")

    # Group by customer name and count delayed orders.
    # NOTE(review): grouping by "name" merges distinct customers who share a
    # name; consider grouping by customer_id as well if that matters.
    # cache() because the result feeds two actions below (write and show);
    # without it the whole read/join/aggregate plan would run twice.
    delayed_by_customer = (
        delayed_orders.groupBy("name")
        .count()
        .withColumnRenamed("count", "delayed_order_count")
        .cache()
    )

    # Save result as CSV; coalesce(1) collapses the output to one partition
    # so a single part file is written under the output directory.
    (
        delayed_by_customer.coalesce(1)
        .write.mode("overwrite")
        .option("header", True)
        .csv("output/delayed_orders_by_customer")
    )

    delayed_by_customer.show()
finally:
    spark.stop()


+-------------------+-------------------+
|               name|delayed_order_count|
+-------------------+-------------------+
|     Claire Johnson|                  1|
|     Marissa Jacobs|                  1|
|Christopher Morales|                  2|
|     Angelica Miles|                  1|
|     Samantha David|                  1|
|       Joseph Woods|                  1|
|      Anthony Moore|                  1|
|    Robert Anderson|                  1|
|  Caitlin Rodriguez|                  3|
|        Julie Hurst|                  1|
|    Sarah Carpenter|                  1|
| Catherine Crawford|                  1|
|  Kristen Mccormick|                  1|
|       Kelsey Miles|                  1|
|     Katelyn Nelson|                  1|
|  Dr. Tiffany Brady|                  1|
|     Jill Robertson|                  1|
|        Erika Lopez|                  1|
|     Benjamin Lewis|                  2|
|     Matthew Snyder|                  1|
+-------------------+-------------------+