In [1]:
# Step 1: Create Dataset and Deliverables folders
import pandas as pd
import os

# Make sure both project folders exist; an already-existing folder is fine.
for folder in ("SupplyChain_Week4/Dataset", "SupplyChain_Week4/Deliverables"):
    os.makedirs(folder, exist_ok=True)

# Step 2: Generate dataset files (Week 1 base data + extended for Week 4)
# Each table is described row by row (one tuple per record) and given its
# column names separately.

# Orders: order 4 deliberately has an empty delivery_date.
orders_df = pd.DataFrame(
    [
        (1, 1, "2025-07-10", "2025-07-15", 1500, 101),
        (2, 2, "2025-07-11", "2025-07-20", 2300, 102),
        (3, 3, "2025-07-12", "2025-07-17", 1800, 103),
        (4, 4, "2025-07-13", "", 2100, 104),
        (5, 5, "2025-07-14", "2025-07-22", 2500, 105),
    ],
    columns=[
        "order_id",
        "customer_id",
        "order_date",
        "delivery_date",
        "total_amount",
        "supplier_id",
    ],
)

# Suppliers: one row per supplier_id referenced by the other two tables.
suppliers_df = pd.DataFrame(
    [
        (101, "Alpha Supplies", "alpha@supplies.com", "North"),
        (102, "Beta Traders", "beta@traders.com", "South"),
        (103, "Gamma Exports", "gamma@exports.com", "East"),
        (104, "Delta Goods", "delta@goods.com", "West"),
        (105, "Epsilon Ltd", "epsilon@ltd.com", "North"),
    ],
    columns=["supplier_id", "supplier_name", "contact_email", "region"],
)

# Inventory: stock level and last restock date per product.
inventory_df = pd.DataFrame(
    [
        (501, "Laptops", 50, 101, "2025-07-05"),
        (502, "Mobiles", 100, 102, "2025-07-07"),
        (503, "Tablets", 60, 103, "2025-07-09"),
        (504, "Printers", 40, 104, "2025-07-11"),
        (505, "Accessories", 150, 105, "2025-07-13"),
    ],
    columns=[
        "inventory_id",
        "product_name",
        "stock_quantity",
        "supplier_id",
        "last_restock_date",
    ],
)

# Write every table to its CSV file without the pandas row index.
for frame, target in (
    (orders_df, "SupplyChain_Week4/Dataset/orders.csv"),
    (suppliers_df, "SupplyChain_Week4/Dataset/suppliers.csv"),
    (inventory_df, "SupplyChain_Week4/Dataset/inventory.csv"),
):
    frame.to_csv(target, index=False)

print("Week 4 dataset files created.")

# Step 3: Initialize PySpark session
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date, current_date, when, datediff

# Obtain a Spark session named "SupplyChain_Week4"; getOrCreate() reuses a
# session that is already running instead of starting a second one.
session_builder = SparkSession.builder.appName("SupplyChain_Week4")
spark = session_builder.getOrCreate()

# Step 4: Load CSV files
# The first line of each file is treated as the header. No schema is supplied
# here, so the columns are cast explicitly in Step 5.
orders_df = spark.read.csv("SupplyChain_Week4/Dataset/orders.csv", header=True)
suppliers_df = spark.read.csv("SupplyChain_Week4/Dataset/suppliers.csv", header=True)
inventory_df = spark.read.csv("SupplyChain_Week4/Dataset/inventory.csv", header=True)

# Step 5: Type casting and date formatting
# Orders: identifier and amount columns become integers, then the two date
# columns are parsed into dates (same column order as before).
for int_column in ("order_id", "customer_id", "total_amount", "supplier_id"):
    orders_df = orders_df.withColumn(int_column, col(int_column).cast("int"))
for date_column in ("order_date", "delivery_date"):
    orders_df = orders_df.withColumn(date_column, to_date(date_column))

# Fill missing delivery_date with today
# NOTE: because current_date() is evaluated at run time, the filled value (and
# the delay derived from it below) changes depending on when this is executed.
delivery = col("delivery_date")
delivery_or_today = when(delivery.isNull(), current_date()).otherwise(delivery)
orders_df = orders_df.withColumn("delivery_date", delivery_or_today)

# Add delay_days and is_delayed
# delay_days = days from order_date to delivery_date;
# is_delayed = 1 when that exceeds 3 days, otherwise 0.
days_to_deliver = datediff(col("delivery_date"), col("order_date"))
orders_df = orders_df.withColumn("delay_days", days_to_deliver)
exceeds_three_days = col("delay_days") > 3
orders_df = orders_df.withColumn("is_delayed", exceeds_three_days.cast("int"))

# Suppliers: only the key column needs a numeric type.
suppliers_df = suppliers_df.withColumn("supplier_id", col("supplier_id").cast("int"))

# Inventory: integer keys and quantity, plus a parsed restock date.
for int_column in ("inventory_id", "supplier_id", "stock_quantity"):
    inventory_df = inventory_df.withColumn(int_column, col(int_column).cast("int"))
inventory_df = inventory_df.withColumn("last_restock_date", to_date("last_restock_date"))

# Step 6: Save cleaned datasets
# Each DataFrame is reduced to a single partition, then written with a header
# row, replacing any previous output at the same path.
# NOTE(review): Spark's CSV writer typically creates a directory at the given
# path (holding a part file) rather than a single file named *.csv -- confirm
# that downstream consumers expect this layout.
cleaned_outputs = (
    (orders_df, "SupplyChain_Week4/Deliverables/cleaned_orders.csv"),
    (suppliers_df, "SupplyChain_Week4/Deliverables/cleaned_suppliers.csv"),
    (inventory_df, "SupplyChain_Week4/Deliverables/cleaned_inventory.csv"),
)
for cleaned_df, destination in cleaned_outputs:
    writer = cleaned_df.coalesce(1).write.mode("overwrite").option("header", True)
    writer.csv(destination)

print("Week 4 cleaned datasets created.")

# Step 7: Display sample outputs
# Print a heading followed by the DataFrame's default show() preview, in the
# order orders -> suppliers -> inventory.
for heading, preview_df in (
    ("Orders DataFrame:", orders_df),
    ("Suppliers DataFrame:", suppliers_df),
    ("Inventory DataFrame:", inventory_df),
):
    print(heading)
    preview_df.show()



Week 4 dataset files created.
Week 4 cleaned datasets created.
Orders DataFrame:
+--------+-----------+----------+-------------+------------+-----------+----------+----------+
|order_id|customer_id|order_date|delivery_date|total_amount|supplier_id|delay_days|is_delayed|
+--------+-----------+----------+-------------+------------+-----------+----------+----------+
|       1|          1|2025-07-10|   2025-07-15|        1500|        101|         5|         1|
|       2|          2|2025-07-11|   2025-07-20|        2300|        102|         9|         1|
|       3|          3|2025-07-12|   2025-07-17|        1800|        103|         5|         1|
|       4|          4|2025-07-13|   2025-08-19|        2100|        104|        37|         1|
|       5|          5|2025-07-14|   2025-07-22|        2500|        105|         8|         1|
+--------+-----------+----------+-------------+------------+-----------+----------+----------+

Suppliers DataFrame:
+-----------+--------------+--------------