In [5]:
!pip install pyspark




In [7]:
# Step 1: Build the three Week 1 sample tables and write them out as CSV files
import os

import pandas as pd

# Input data goes to Dataset/, results produced later go to Deliverables/.
for folder in ("WEEK_3/Dataset", "WEEK_3/Deliverables"):
    os.makedirs(folder, exist_ok=True)

# customers.csv — one row per customer
customers = pd.DataFrame(
    [
        (1, "Asha Patel", "asha.patel@example.com", "South"),
        (2, "Rohan Sharma", "rohan.sharma@example.com", "North"),
        (3, "Neha Reddy", "neha.reddy@example.com", "West"),
        (4, "Arjun Mehta", "arjun.mehta@example.com", "East"),
        (5, "Isha Kapoor", "isha.kapoor@example.com", "South"),
    ],
    columns=["customer_id", "customer_name", "email", "region"],
)

# orders.csv — order 4 has an empty delivery_date (not delivered yet)
orders = pd.DataFrame(
    [
        (1, 1, "2025-07-10", "2025-07-15", 1500),
        (2, 2, "2025-07-11", "2025-07-20", 2300),
        (3, 3, "2025-07-12", "2025-07-17", 1800),
        (4, 4, "2025-07-13", "", 2100),
        (5, 5, "2025-07-14", "2025-07-22", 2500),
    ],
    columns=["order_id", "customer_id", "order_date", "delivery_date", "total_amount"],
)

# delivery_status.csv — latest delivery status per order
delivery = pd.DataFrame(
    [
        (201, 1, "Delivered", "2025-07-15", "BlueDart", "Delivered on time"),
        (202, 2, "Delayed", "2025-07-20", "Delhivery", "Delayed due to rain"),
        (203, 3, "Delivered", "2025-07-17", "FedEx", "Delivered successfully"),
        (204, 4, "In Transit", "2025-07-18", "Ecom Express", "Still in transit"),
        (205, 5, "Delayed", "2025-07-22", "XpressBees", "Customer not available"),
    ],
    columns=["delivery_id", "order_id", "status", "updated_at", "carrier", "remarks"],
)

# Write each table without the pandas index so the files hold only the data columns.
customers.to_csv("WEEK_3/Dataset/customers.csv", index=False)
orders.to_csv("WEEK_3/Dataset/orders.csv", index=False)
delivery.to_csv("WEEK_3/Dataset/delivery_status.csv", index=False)

print("All Week 1 CSVs created under WEEK_3/Dataset")

# Step 2: PySpark Setup
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, datediff, when

# An order counts as delayed when delivery took more than this many calendar days.
DELAY_THRESHOLD_DAYS = 3

spark = SparkSession.builder.appName("Customer_Order_Analysis").getOrCreate()

# Step 3: Load CSVs using Spark
# No schema is supplied, so the key and date columns are cast explicitly in Step 4.
customers_df = spark.read.option("header", True).csv("WEEK_3/Dataset/customers.csv")
orders_df = spark.read.option("header", True).csv("WEEK_3/Dataset/orders.csv")
delivery_status_df = spark.read.option("header", True).csv("WEEK_3/Dataset/delivery_status.csv")

# Step 4: Convert types and calculate delay
# delay_days is the number of calendar days between order and delivery.
# datediff is used instead of dividing an epoch-second difference by 86400:
# with a daylight-saving session time zone the latter yields fractional days
# (e.g. 3 days + 1 hour), which would wrongly cross the threshold.
# Orders without a delivery date (null after the cast) get a delay of 0 and
# are therefore never flagged as delayed.
orders_df = (
    orders_df
    .withColumn("order_id", col("order_id").cast("int"))
    .withColumn("customer_id", col("customer_id").cast("int"))
    .withColumn("order_date", col("order_date").cast("timestamp"))
    .withColumn("delivery_date", col("delivery_date").cast("timestamp"))
    .withColumn(
        "delay_days",
        when(
            col("delivery_date").isNotNull(),
            datediff(col("delivery_date"), col("order_date")),
        ).otherwise(0),
    )
    .withColumn(
        "is_delayed",
        when(col("delay_days") > DELAY_THRESHOLD_DAYS, 1).otherwise(0),
    )
)

# Cast the join key to int so it has the same type on both sides of the join.
customers_df = customers_df.withColumn("customer_id", col("customer_id").cast("int"))

# Step 5: Join customers and orders
# Inner join: only orders whose customer_id exists in customers are kept.
joined_df = orders_df.join(customers_df, on="customer_id", how="inner")

# Step 6: Group by region and count delayed orders
# is_delayed is 0/1, so its sum per region is the number of delayed orders.
delay_summary = (
    joined_df
    .groupBy("region")
    .sum("is_delayed")
    .withColumnRenamed("sum(is_delayed)", "delayed_orders")
)

# Step 7: Save final result to Deliverables folder
# Spark writes a *directory* at this path containing part-*.csv files;
# coalesce(1) makes sure there is a single part file inside it.
summary_output_dir = "WEEK_3/Deliverables/delayed_orders_by_region"
(
    delay_summary
    .coalesce(1)
    .write.mode("overwrite")
    .option("header", True)
    .csv(summary_output_dir)
)

print(f"\nWeek 3 output written to folder {summary_output_dir} (single part-*.csv file inside)")
delay_summary.show()


All Week 1 CSVs created under WEEK_3/Dataset

Week 3 output saved as WEEK_3/Deliverables/delayed_orders_by_region.csv
+------+--------------+
|region|delayed_orders|
+------+--------------+
| South|             2|
|  East|             0|
|  West|             1|
| North|             1|
+------+--------------+



In [None]:
import glob
import os
import shutil

# Spark wrote its result as a part-*.csv file inside this folder; copy it out
# under a stable, human-friendly file name.
output_folder = "WEEK_3/Deliverables/delayed_orders_by_region"
final_csv = "WEEK_3/Deliverables/delayed_orders_by_region.csv"

# Sorted so the choice is deterministic if more than one part file exists.
part_files = sorted(glob.glob(f"{output_folder}/part-*.csv"))

if part_files:
    # Move and rename the first part file
    shutil.move(part_files[0], final_csv)
elif not os.path.exists(final_csv):
    # Neither a fresh Spark output nor a previously exported file is present.
    raise FileNotFoundError(
        f"No part-*.csv file found in {output_folder}; run the Spark write step first."
    )
# Otherwise the part file was already moved by an earlier run of this cell,
# so the cell is safe to re-run.

print(f" Final CSV ready: {final_csv}")



In [9]:
# Offer the exported CSV as a browser download when running in Google Colab.
# Relies on `final_csv` set by the export cell that moves the Spark part file.
try:
    from google.colab import files
except ImportError:
    # Outside Colab there is nothing to download: the file is already on disk.
    print(f"Not running in Google Colab; the CSV is available locally at {final_csv}")
else:
    files.download(final_csv)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>