In [3]:
# Step 1: Create Dataset files based on Week 1 schema
import pandas as pd
import os

# Create the output folders (no-op when they already exist).
os.makedirs("SupplyChain_Week3/Dataset", exist_ok=True)
os.makedirs("SupplyChain_Week3/Deliverables", exist_ok=True)

# orders.csv: one row per order, fully populated.
orders_df = pd.DataFrame({
    "order_id": [101, 102, 103, 104],
    "product_id": [1, 2, 3, 4],
    "quantity": [5, 10, 2, 8],
    "order_date": ["2025-07-10", "2025-07-11", "2025-07-12", "2025-07-13"],
})
orders_df.to_csv("SupplyChain_Week3/Dataset/orders.csv", index=False)

# suppliers.csv: the third supplier deliberately has no contact email so the
# cleanup step has a missing value to fill.
suppliers_df = pd.DataFrame({
    "supplier_id": [201, 202, 203, 204],
    "supplier_name": ["TechZone", "GearHub", "CoreSupply", "PrimeParts"],
    "contact_email": [
        "support@techzone.com",
        "hello@gearhub.com",
        None,
        "prime@parts.com",
    ],
})
suppliers_df.to_csv("SupplyChain_Week3/Dataset/suppliers.csv", index=False)

# inventory.csv: the third product has no stock level and a blank restock date.
# A plain list containing None makes pandas infer float64, which would write
# "30.0"/"50.0"/"40.0" to the CSV. The nullable Int64 dtype keeps the values
# as integers ("30", "50", "40") and still writes the missing entry as an
# empty field.
inventory_df = pd.DataFrame({
    "product_id": [1, 2, 3, 4],
    "product_name": ["Laptop", "Monitor", "Keyboard", "Mouse"],
    "stock_level": pd.array([30, 50, None, 40], dtype="Int64"),
    "restock_date": ["2025-07-15", "2025-07-16", "", "2025-07-18"],
})
inventory_df.to_csv("SupplyChain_Week3/Dataset/inventory.csv", index=False)

print(" All 3 dataset CSVs created.")

# Step 2: PySpark setup
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, lit

# Start (or reuse) the Spark session used by the rest of this cleanup job.
spark = (
    SparkSession.builder
    .appName("SupplyChain_Week3_Cleanup")
    .getOrCreate()
)

# Step 3: Load the CSVs
# The first row of each file is treated as the header; no schema inference is
# requested here, so the typed casts happen in the cleaning steps below.
orders = spark.read.csv("SupplyChain_Week3/Dataset/orders.csv", header=True)
suppliers = spark.read.csv("SupplyChain_Week3/Dataset/suppliers.csv", header=True)
inventory = spark.read.csv("SupplyChain_Week3/Dataset/inventory.csv", header=True)

# Step 4: Clean orders.csv
# Give every orders column its proper type; each column is replaced in place,
# so the column order is unchanged.
order_column_types = {
    "order_id": "int",
    "product_id": "int",
    "quantity": "int",
    "order_date": "date",
}
for column_name, target_type in order_column_types.items():
    orders = orders.withColumn(column_name, col(column_name).cast(target_type))

# Step 5: Clean suppliers.csv
# Cast supplier_id to int so it is typed like the id columns of the other two
# tables, then substitute a placeholder address for any missing contact email.
suppliers = suppliers.withColumn("supplier_id", col("supplier_id").cast("int")) \
                     .fillna({"contact_email": "noemail@supplier.com"})

# Step 6: Clean inventory.csv
# stock_level is cast through double before int: the CSV may carry the values
# as "30.0" (pandas writes an integer column containing a missing value as
# floats), and a direct string-to-int cast of such text is not accepted by
# Spark when ANSI mode is enabled. Going through double parses both "30" and
# "30.0"; missing values stay null and are then replaced by 0.
# restock_date: blank strings are mapped to null explicitly before the date
# cast so that an empty field never reaches the cast.
inventory = inventory.withColumn("product_id", col("product_id").cast("int")) \
                     .withColumn("stock_level",
                                 col("stock_level").cast("double").cast("int")) \
                     .withColumn("restock_date",
                                 when(col("restock_date") == "", None)
                                 .otherwise(col("restock_date"))) \
                     .withColumn("restock_date", col("restock_date").cast("date")) \
                     .fillna({"stock_level": 0})

# Step 7: Save cleaned CSVs
# Each DataFrame is collapsed to a single partition and written with a header
# row, replacing any output from a previous run.
# NOTE(review): Spark's CSV writer treats the target path as an output
# directory, so each "cleaned_*.csv" below is expected to be a folder holding
# one part file rather than a plain file - confirm this is what consumers want.
cleaned_outputs = [
    (orders, "SupplyChain_Week3/Deliverables/cleaned_orders.csv"),
    (suppliers, "SupplyChain_Week3/Deliverables/cleaned_suppliers.csv"),
    (inventory, "SupplyChain_Week3/Deliverables/cleaned_inventory.csv"),
]
for cleaned_df, target_path in cleaned_outputs:
    single_part_writer = cleaned_df.coalesce(1).write.mode("overwrite")
    single_part_writer.option("header", True).csv(target_path)

# Step 8: Display outputs
# Print a heading followed by the rows of each cleaned table, in the order
# orders, suppliers, inventory.
cleaned_tables = [
    ("\nCleaned Orders:", orders),
    ("\nCleaned Suppliers:", suppliers),
    ("\nCleaned Inventory:", inventory),
]
for heading, table in cleaned_tables:
    print(heading)
    table.show()


 All 3 dataset CSVs created.

Cleaned Orders:
+--------+----------+--------+----------+
|order_id|product_id|quantity|order_date|
+--------+----------+--------+----------+
|     101|         1|       5|2025-07-10|
|     102|         2|      10|2025-07-11|
|     103|         3|       2|2025-07-12|
|     104|         4|       8|2025-07-13|
+--------+----------+--------+----------+


Cleaned Suppliers:
+-----------+-------------+--------------------+
|supplier_id|supplier_name|       contact_email|
+-----------+-------------+--------------------+
|        201|     TechZone|support@techzone.com|
|        202|      GearHub|   hello@gearhub.com|
|        203|   CoreSupply|noemail@supplier.com|
|        204|   PrimeParts|     prime@parts.com|
+-----------+-------------+--------------------+


Cleaned Inventory:
+----------+------------+-----------+------------+
|product_id|product_name|stock_level|restock_date|
+----------+------------+-----------+------------+
|         1|      Laptop|      