In [15]:
# Step 1: Import required libraries
import pandas as pd
import numpy as np

# Orders whose order-to-delivery gap exceeds this many whole days count as delayed.
DELAY_THRESHOLD_DAYS = 3


def compute_delays(orders, threshold_days=DELAY_THRESHOLD_DAYS, as_of=None):
    """Return a copy of *orders* with ``delay_days`` and ``is_delayed`` added.

    Args:
        orders: DataFrame with ``order_date`` and ``delivery_date`` columns
            (strings or datetimes; unparseable values become NaT).
        threshold_days: An order is delayed when ``delay_days`` is strictly
            greater than this value.
        as_of: Date substituted for a missing ``delivery_date``. Defaults to
            today. It is truncated to midnight so the result does not depend
            on the time of day the analysis is run.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    scored = orders.copy()
    scored['order_date'] = pd.to_datetime(scored['order_date'], errors='coerce')
    scored['delivery_date'] = pd.to_datetime(scored['delivery_date'], errors='coerce')

    # Normalise to midnight: a raw "now" timestamp would leak a time-of-day
    # into the date column (and into the saved CSV) and make reruns differ.
    cutoff = pd.Timestamp.today() if as_of is None else pd.Timestamp(as_of)
    cutoff = cutoff.normalize()

    # NOTE(review): after this fill, not-yet-delivered orders are
    # indistinguishable from orders delivered on the cutoff date.
    scored['delivery_date'] = scored['delivery_date'].fillna(cutoff)
    scored['delay_days'] = (scored['delivery_date'] - scored['order_date']).dt.days
    # Rows with an unparseable order_date have NaN delay_days; NaN > threshold
    # is False, so they are reported as not delayed.
    scored['is_delayed'] = np.where(scored['delay_days'] > threshold_days, 1, 0)
    return scored


def summarize_delays(merged):
    """Count delayed orders per customer, most-delayed customers first.

    Args:
        merged: Orders joined with customer data; must contain
            ``customer_id``, ``customer_name`` and ``is_delayed``.

    Returns:
        DataFrame with columns ``customer_id``, ``customer_name``,
        ``is_delayed`` (the per-customer count of delayed orders).
    """
    # dropna=False keeps orders whose customer_id had no match in the customer
    # table (customer_name is NaN after a left merge); by default groupby
    # would silently discard them and under-report delays.
    summary = (
        merged.groupby(['customer_id', 'customer_name'], dropna=False)['is_delayed']
        .sum()
        .reset_index()
    )
    return summary.sort_values(by='is_delayed', ascending=False)


# A notebook cell or a directly executed script runs with
# __name__ == "__main__", so the pipeline below still runs as before and binds
# the same top-level names; importing this code no longer triggers file I/O.
if __name__ == "__main__":
    # Step 2: Load the datasets
    customers_df = pd.read_csv("customers.csv")
    orders_df = pd.read_csv("orders (1).csv")  # or "orders.csv" if renamed
    delivery_status_df = pd.read_csv("delivery_status.csv")

    # Step 3: Preprocess date columns (order dates are parsed in compute_delays)
    delivery_status_df['updated_at'] = pd.to_datetime(delivery_status_df['updated_at'], errors='coerce')

    # Step 4: Handle missing values and compute delay
    orders_df = compute_delays(orders_df)

    # Step 5: Merge orders with customer information
    merged_df = pd.merge(orders_df, customers_df, on='customer_id', how='left')

    # Step 6: Analyze delay summary by customer
    delay_summary = summarize_delays(merged_df)

    # Step 7: Save cleaned data
    orders_df.to_csv("cleaned_orders.csv", index=False)
    print("Cleaned dataset saved as cleaned_orders.csv")
    print("Delay summary by customer:")
    print(delay_summary)


Cleaned dataset saved as cleaned_orders.csv
Delay summary by customer:
   customer_id customer_name  is_delayed
0            1    Asha Patel           1
1            2  Rohan Sharma           1
2            3    Neha Reddy           1
3            4   Arjun Mehta           1
4            5   Isha Kapoor           1
