In [12]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Order Delay Analysis by Region").getOrCreate()

spark

In [13]:
from google.colab import drive
drive.mount('/content/drive')

orders_df = spark.read.option("header", True).csv("/content/drive/MyDrive/orders.csv", inferSchema=True)
customers_df = spark.read.option("header", True).csv("/content/drive/MyDrive/customers.csv", inferSchema=True)
status_df = spark.read.option("header", True).csv("/content/drive/MyDrive/delivery_status.csv", inferSchema=True)

orders_df.show()
customers_df.show()
status_df.show()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
+--------+-----------+----------+-------------+
|order_id|customer_id|order_date|delivery_date|
+--------+-----------+----------+-------------+
|     101|          1|2024-06-01|   2024-06-05|
|     102|          2|2024-06-02|   2024-06-06|
|     103|          3|2024-06-03|         NULL|
|     104|          4|2024-06-04|   2024-06-08|
|     105|          5|2024-06-05|         NULL|
|     106|          6|2024-06-06|   2024-06-10|
|     107|          7|2024-06-07|   2024-06-11|
|     108|          8|2024-06-08|         NULL|
|     109|          9|2024-06-09|   2024-06-13|
|     110|         10|2024-06-10|         NULL|
|     111|         11|2024-06-11|   2024-06-15|
|     112|         12|2024-06-12|   2024-06-16|
|     113|         13|2024-06-13|         NULL|
|     114|         14|2024-06-14|   2024-06-18|
|     115|         15|2024-06-15|   2024-06-19|
|     1

In [14]:
orders_joined = orders_df.join(customers_df, on="customer_id", how="left") \
    .join(status_df.select("order_id", "status"), on="order_id", how="left")

In [15]:
from pyspark.sql.functions import col, when, current_date

orders_with_delay = orders_joined.withColumn(
    "delayed",
    when((col("status") != "Delivered") & (col("delivery_date").isNotNull()) & (col("delivery_date") < current_date()), 1).otherwise(0)
)
orders_df.show()

+--------+-----------+----------+-------------+
|order_id|customer_id|order_date|delivery_date|
+--------+-----------+----------+-------------+
|     101|          1|2024-06-01|   2024-06-05|
|     102|          2|2024-06-02|   2024-06-06|
|     103|          3|2024-06-03|         NULL|
|     104|          4|2024-06-04|   2024-06-08|
|     105|          5|2024-06-05|         NULL|
|     106|          6|2024-06-06|   2024-06-10|
|     107|          7|2024-06-07|   2024-06-11|
|     108|          8|2024-06-08|         NULL|
|     109|          9|2024-06-09|   2024-06-13|
|     110|         10|2024-06-10|         NULL|
|     111|         11|2024-06-11|   2024-06-15|
|     112|         12|2024-06-12|   2024-06-16|
|     113|         13|2024-06-13|         NULL|
|     114|         14|2024-06-14|   2024-06-18|
|     115|         15|2024-06-15|   2024-06-19|
|     116|         16|2024-06-16|         NULL|
|     117|         17|2024-06-17|   2024-06-21|
|     118|         18|2024-06-18|       

In [16]:
from pyspark.sql.functions import sum,col

delay_by_region = orders_with_delay.groupBy("region").sum("delayed").withColumnRenamed("sum(delayed)", "delayed_order_count") \
    .orderBy(col("delayed_order_count").desc())

delay_by_region.show()

+------+-------------------+
|region|delayed_order_count|
+------+-------------------+
|  East|                  2|
| North|                  1|
| South|                  0|
|  West|                  0|
+------+-------------------+



In [17]:
delay_by_region.coalesce(1).write.mode("overwrite").option("header", True).csv("/content/drive/MyDrive/delayed_orders_by_region")