In [11]:
import glob
import os

# Show every CSV under the current working directory, including
# sub-directories, so the exact relative paths can be copied from the output.
print("CSV files in workspace:")
csv_paths = glob.glob("**/*.csv", recursive=True)
for csv_path in csv_paths:
    print(f" - {csv_path}")


CSV files in workspace:
 - sample_data/mnist_test.csv
 - sample_data/mnist_train_small.csv
 - sample_data/california_housing_train.csv
 - sample_data/california_housing_test.csv


In [12]:
# Locations of the three input CSVs, relative to the working directory.
# Replace these with exact matches from Step 1.
customers_path, orders_path, delivery_path = (
    "customers.csv",
    "orders.csv",
    "delivery_status.csv",
)


In [13]:
# Upload a file from the local machine into the Colab runtime.
# In the recorded run this cell saved orders.csv into the working directory.
from google.colab import files

# The result is kept in `uploaded`; no visible cell reads it afterwards.
uploaded = files.upload()


Saving orders.csv to orders.csv


In [15]:
# Upload a file from the local machine into the Colab runtime.
# In the recorded run this cell saved delivery_status.csv into the working directory.
from google.colab import files

# The result is kept in `uploaded`; no visible cell reads it afterwards.
uploaded = files.upload()


Saving delivery_status.csv to delivery_status.csv


In [17]:
# Upload a file from the local machine into the Colab runtime.
# In the recorded run this cell saved customers.csv into the working directory.
from google.colab import files

# The result is kept in `uploaded`; no visible cell reads it afterwards.
uploaded = files.upload()


Saving customers.csv to customers.csv


In [18]:
import glob

# Confirm which CSVs now sit directly in the working directory
# (non-recursive: sub-directories are not searched).
print("CSV files in workspace:")
top_level_csvs = glob.glob("*.csv")
for csv_name in top_level_csvs:
    print(f"- {csv_name}")


CSV files in workspace:
- orders.csv
- customers.csv
- orders (1).csv
- delivery_status.csv


In [19]:
# Week 4 – Supply Chain Monitoring & Optimization
# Customer Order Insights (Exact Replica Format)

# Step 1: Install dependencies
!pip install pyspark==3.5.1 delta-spark==3.1.0 -q

from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip
from pyspark.sql.functions import col, to_date, current_date, datediff

import pandas as pd
import os

# Step 2: Create Spark session
builder = SparkSession.builder.appName("Week4_SupplyChain").config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension").config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
spark = configure_spark_with_delta_pip(builder).getOrCreate()

print("Spark Session Created")

# Step 3: Load Week 1–3 datasets (from uploaded files)
customers_path = "customers.csv"
orders_path = "orders.csv"
delivery_path = "delivery_status.csv"

customers_df = pd.read_csv(customers_path)
orders_df = pd.read_csv(orders_path)
delivery_status_df = pd.read_csv(delivery_path)

print("Datasets loaded successfully")

# Step 4: Convert to Spark DataFrames
customers_sdf = spark.createDataFrame(customers_df)
orders_sdf = spark.createDataFrame(orders_df)
delivery_sdf = spark.createDataFrame(delivery_status_df)

# Step 5: Data Cleaning – Convert dates
orders_sdf = orders_sdf.withColumn("order_date", to_date(col("order_date"), "yyyy-MM-dd"))
orders_sdf = orders_sdf.withColumn("delivery_date", to_date(col("delivery_date"), "yyyy-MM-dd"))

# Fill missing delivery_date with today's date
orders_sdf = orders_sdf.fillna({"delivery_date": pd.Timestamp.today().strftime("%Y-%m-%d")})

# Step 6: Add delay column
orders_sdf = orders_sdf.withColumn("delay_days", datediff(col("delivery_date"), col("order_date")))

# Step 7: Join datasets
orders_with_status = (
    orders_sdf.join(customers_sdf, "customer_id", "left")
              .join(delivery_sdf, "order_id", "left")
)

# Step 8: Convert to Pandas for saving
final_df = orders_with_status.toPandas()

# Step 9: Save output as Deliverable
deliverables_dir = "WEEK_4/Deliverables"
os.makedirs(deliverables_dir, exist_ok=True)

output_path = os.path.join(deliverables_dir, "orders_with_status.csv")
final_df.to_csv(output_path, index=False)

print(f"Deliverable saved successfully: {output_path}")


Spark Session Created
Datasets loaded successfully
Deliverable saved successfully: WEEK_4/Deliverables/orders_with_status.csv
