In [2]:
# Step 1: Create Dataset and Deliverables folders
import pandas as pd
import os

os.makedirs("SupplyChain_Week3/Dataset", exist_ok=True)
os.makedirs("SupplyChain_Week3/Deliverables", exist_ok=True)

# Step 2: Generate Week 1 sample data

# customers.csv
customers_df = pd.DataFrame({
    "customer_id": [1, 2, 3, 4, 5],
    "customer_name": ["Asha Patel", "Rohan Sharma", "Neha Reddy", "Arjun Mehta", "Isha Kapoor"],
    "email": [
        "asha.patel@example.com",
        "rohan.sharma@example.com",
        "neha.reddy@example.com",
        "arjun.mehta@example.com",
        "isha.kapoor@example.com"
    ],
    "region": ["South", "North", "West", "East", "South"]
})
customers_df.to_csv("SupplyChain_Week3/Dataset/customers.csv", index=False)

# orders.csv
orders_df = pd.DataFrame({
    "order_id": [1, 2, 3, 4, 5],
    "customer_id": [1, 2, 3, 4, 5],
    "order_date": ["2025-07-10", "2025-07-11", "2025-07-12", "2025-07-13", "2025-07-14"],
    "delivery_date": ["2025-07-15", "2025-07-20", "2025-07-17", "", "2025-07-22"],
    "total_amount": [1500, 2300, 1800, 2100, 2500]
})
orders_df.to_csv("SupplyChain_Week3/Dataset/orders.csv", index=False)

# delivery_status.csv
delivery_status_df = pd.DataFrame({
    "delivery_id": [201, 202, 203, 204, 205],
    "order_id": [1, 2, 3, 4, 5],
    "status": ["Delivered", "Delayed", "Delivered", "In Transit", "Delayed"],
    "updated_at": ["2025-07-15", "2025-07-20", "2025-07-17", "2025-07-18", "2025-07-22"],
    "carrier": ["BlueDart", "Delhivery", "FedEx", "Ecom Express", "XpressBees"],
    "remarks": [
        "Delivered on time",
        "Delayed due to rain",
        "Delivered successfully",
        "Still in transit",
        "Customer not available"
    ]
})
delivery_status_df.to_csv("SupplyChain_Week3/Dataset/delivery_status.csv", index=False)

print("Dataset files created.")

# Step 3: Initialize PySpark session
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date, datediff, current_date, when

spark = SparkSession.builder.appName("SupplyChain_Week3").getOrCreate()

# Step 4: Load CSVs
customers_df = spark.read.option("header", True).csv("SupplyChain_Week3/Dataset/customers.csv")
orders_df = spark.read.option("header", True).csv("SupplyChain_Week3/Dataset/orders.csv")
delivery_status_df = spark.read.option("header", True).csv("SupplyChain_Week3/Dataset/delivery_status.csv")

# Step 5: Type casting and date formatting
orders_df = orders_df.withColumn("order_id", col("order_id").cast("int")) \
                     .withColumn("customer_id", col("customer_id").cast("int")) \
                     .withColumn("total_amount", col("total_amount").cast("int")) \
                     .withColumn("order_date", to_date("order_date")) \
                     .withColumn("delivery_date", to_date("delivery_date"))

# Step 6: Fill missing delivery_date with today's date
orders_df = orders_df.withColumn(
    "delivery_date",
    when(col("delivery_date").isNull(), current_date()).otherwise(col("delivery_date"))
)

# Step 7: Calculate delay days and delayed flag
orders_df = orders_df.withColumn("delay_days", datediff(col("delivery_date"), col("order_date"))) \
                     .withColumn("is_delayed", (col("delay_days") > 3).cast("int"))

# Step 8: Join orders with customers to get region info
joined_df = orders_df.join(customers_df, on="customer_id", how="left")

# Step 9: Group by region to get delayed order count
delayed_by_region = joined_df.groupBy("region").sum("is_delayed") \
                             .withColumnRenamed("sum(is_delayed)", "delayed_orders") \
                             .orderBy("delayed_orders", ascending=False)

# Step 10: Save final result
delayed_by_region.coalesce(1).write.mode("overwrite").option("header", True) \
                  .csv("SupplyChain_Week3/Deliverables/delayed_orders_by_region.csv")

# Step 11: Show output
print("Delayed Orders by Region:")
delayed_by_region.show()


Dataset files created.
Delayed Orders by Region:
+------+--------------+
|region|delayed_orders|
+------+--------------+
| South|             2|
|  East|             1|
|  West|             1|
| North|             1|
+------+--------------+

