In [21]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, to_date

spark = (
    SparkSession.builder
    .appName("SupplyChainProcessing")
    .getOrCreate()
)

# Load the supplier, inventory and order tables from CSV files in the
# working directory. The first line of each file is treated as a header and
# column types are inferred by Spark.
suppliers_df = spark.read.csv("suppliers.csv", header=True, inferSchema=True)
inventory_df = spark.read.csv("inventory.csv", header=True, inferSchema=True)
orders_df_raw = spark.read.csv("orders.csv", header=True, inferSchema=True)

# Print a banner followed by the inferred schema for each loaded table,
# separated by a blank line.
schema_reports = (
    ("=== Suppliers Schema ===", suppliers_df),
    ("=== Inventory Schema ===", inventory_df),
    ("=== Orders Schema ===", orders_df_raw),
)
for banner, frame in schema_reports:
    print(banner)
    frame.printSchema()

# Pass both date columns through to_date with the yyyy-MM-dd pattern so they
# are DateType regardless of what schema inference produced.
orders_df = orders_df_raw
for date_column in ("order_date", "delivery_date"):
    orders_df = orders_df.withColumn(
        date_column, to_date(col(date_column), "yyyy-MM-dd")
    )

# An order counts as delayed here when its status is anything other than
# "Delivered".
# NOTE(review): rows with a null status are dropped too, because the
# comparison evaluates to null rather than true — confirm this is intended.
not_delivered = col("status") != "Delivered"
delayed_df = orders_df.where(not_delivered)

# Only the product -> supplier mapping is needed from the inventory table.
inventory_trimmed = inventory_df.select("product_id", "supplier_id")

# Attach the supplier_id to each delayed order via its product.
orders_with_supplier = delayed_df.join(
    inventory_trimmed, on=["product_id"], how="inner"
)

# Bring in the supplier attributes (name, contact details) for each order.
joined_df = orders_with_supplier.join(
    suppliers_df, on=["supplier_id"], how="inner"
)

# One output row per (supplier_id, name) pair, carrying the number of
# non-null order_id values in that group as "delayed_orders".
delayed_orders_count = count(col("order_id")).alias("delayed_orders")
result_df = joined_df.groupBy("supplier_id", "name").agg(delayed_orders_count)

# Show and save result.
# result_df is consumed by three separate actions (show, CSV write, Parquet
# write). Without caching, each action re-executes the whole lineage,
# including re-reading the CSV inputs, so mark it for caching first.
result_df.cache()
try:
    result_df.show()

    # coalesce(1) makes each writer emit a single part file in its directory.
    single_partition_df = result_df.coalesce(1)
    single_partition_df.write.mode("overwrite").csv(
        "output/delayed_orders_by_supplier_csv", header=True
    )
    single_partition_df.write.mode("overwrite").parquet(
        "output/delayed_orders_by_supplier_parquet"
    )
finally:
    # Always release the Spark session (and with it the cached data), even
    # when displaying or writing the result fails.
    spark.stop()


=== Suppliers Schema ===
root
 |-- supplier_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- contact_email: string (nullable = true)
 |-- phone: string (nullable = true)

=== Inventory Schema ===
root
 |-- product_id: integer (nullable = true)
 |-- product_name: string (nullable = true)
 |-- supplier_id: integer (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- reorder_level: integer (nullable = true)

=== Orders Schema ===
root
 |-- order_id: integer (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- order_date: date (nullable = true)
 |-- delivery_date: date (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- status: string (nullable = true)

+-----------+-------------------+--------------+
|supplier_id|               name|delayed_orders|
+-----------+-------------------+--------------+
|         10|   Kappa Supply Co.|             1|
|          8|Theta Manufacturing|             1|
|          4| Delta Distributors|      