In [1]:
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta
import random

In [2]:
# Seed every random source this notebook draws from (stdlib `random`, NumPy's
# legacy global generator, and Faker) so that Restart Kernel -> Run All
# regenerates the same dataset instead of a different one on every run.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
Faker.seed(SEED)

# Faker instance used further down to draw the calendar date of each order.
fake = Faker()

In [3]:
# Dataset parameters
num_orders = 100_000        # total number of order rows to generate
num_delivery_persons = 400  # size of the delivery-person pool
cities = [
    "Mumbai",
    "Delhi",
    "Bangalore",
    "Chennai",
    "Hyderabad",
    "Kolkata",
    "Pune",
    "Kochi",
]
start_date = datetime(year=2025, month=1, day=1)   # first day orders may fall on
end_date = datetime(year=2025, month=1, day=31)    # last day orders may fall on

In [4]:
# Delivery-person -> city lookup. IDs are "D" followed by the person's
# 1-based number zero-padded to four digits; each ID gets exactly one
# randomly chosen city.
delivery_persons = {
    f"D{person_number:04d}": random.choice(cities)
    for person_number in range(1, num_delivery_persons + 1)
}

In [5]:
# Hourly order-probability weights, indexed by hour of day (0-23).
# Every hour starts at the base weight; the two peak windows,
# 12:00-14:59 (12-3 PM) and 19:00-21:59 (7-10 PM), get three times that.
# Slices are end-exclusive, so 12:15 covers hours 12, 13, 14 and 19:22 covers
# hours 19, 20, 21.
hourly_weights = np.full(24, 0.05)
hourly_weights[12:15] = 0.15
hourly_weights[19:22] = 0.15
hourly_weights /= hourly_weights.sum()  # Normalize to sum to 1

In [6]:
# Order-quantity distribution over 1-10 items, weighted towards 2-5 items.
# The weights as written are relative (they add up to 1.15), so they are
# rescaled into proper probabilities before being handed to np.random.choice.
order_quantity_choices = list(range(1, 11))
order_quantity_probs = np.array(
    [0.10, 0.20, 0.20, 0.15, 0.15, 0.10, 0.10, 0.05, 0.05, 0.05]
)
order_quantity_probs = order_quantity_probs / order_quantity_probs.sum()

In [7]:
# Generate `num_orders` synthetic orders; each row is a list whose field order
# is [delivery_person_id, order_id, order_value, order_quantity, city,
#     order_time, delivery_time, order_rating].
orders = []

# The pool of IDs never changes, so build the list once instead of
# re-materialising it from the dict for every single order.
delivery_person_ids = list(delivery_persons)

for i in range(1, num_orders + 1):
    # "O" + sequence number zero-padded to six digits, e.g. O000001
    order_id = f"O{i:06d}"

    # Select a delivery person and get their assigned city
    delivery_person_id = random.choice(delivery_person_ids)
    city = delivery_persons[delivery_person_id]

    # Order Quantity: Weighted selection
    order_quantity = np.random.choice(order_quantity_choices, p=order_quantity_probs)

    # Base per-item price: normally distributed (₹250 mean, ₹100 std).
    # A draw can be negative; the clip below floors the final value at ₹50.
    base_value = np.random.normal(250, 100)

    # Quantity-tier multiplier applied on top of base_value * quantity.
    # NOTE(review): the multiplier grows with quantity (1.2 -> 2.5), i.e. larger
    # orders are scaled up rather than discounted per item -- confirm that this
    # is the intended pricing model.
    if order_quantity <= 2:
        scaling_factor = 1.2
    elif order_quantity <= 5:
        scaling_factor = 1.5
    elif order_quantity <= 8:
        scaling_factor = 1.8
    else:
        scaling_factor = 2.5

    # Final Order Value: ±10% random noise, clipped to ₹50 - ₹1500
    order_value = max(50, min(1500, base_value * order_quantity * (np.random.uniform(0.9, 1.1) * scaling_factor)))
    order_value = round(order_value, 3)

    # Order Time: random day in the configured window, hour weighted towards
    # peak hours, minute uniform in 0-59
    order_date = fake.date_between(start_date=start_date, end_date=end_date)
    order_hour = np.random.choice(24, p=hourly_weights)  # Weighted selection
    order_minute = np.random.randint(0, 60)
    order_time = datetime(order_date.year, order_date.month, order_date.day, order_hour, order_minute)

    # Delivery Time: random delay of 20-90 minutes. randint's upper bound is
    # exclusive, so 91 is required for a 90-minute delay to be possible.
    delivery_time = order_time + timedelta(minutes=np.random.randint(20, 91))

    # Order Rating: More 4s and 5s
    order_rating = np.random.choice([1, 2, 3, 4, 5], p=[0.05, 0.1, 0.25, 0.35, 0.25])

    orders.append([delivery_person_id, order_id, order_value, order_quantity, city, order_time, delivery_time, order_rating])

In [8]:
# Assemble the generated rows into a DataFrame. The column names are listed in
# the same order as the fields inside each row of `orders`.
df = pd.DataFrame(
    data=orders,
    columns=[
        "delivery_person_id",
        "order_id",
        "order_value",
        "order_quantity",
        "city",
        "order_timestamp",
        "delivery_timestamp",
        "order_rating",
    ],
)

In [9]:
# Preview the first five generated orders as a sanity check.
df.head(n=5)

Unnamed: 0,delivery_person_id,order_id,order_value,order_quantity,city,order_timestamp,delivery_timestamp,order_rating
0,D0325,O000001,1455.754,4,Kochi,2025-01-16 18:27:00,2025-01-16 19:09:00,3
1,D0153,O000002,617.571,5,Kolkata,2025-01-27 11:18:00,2025-01-27 12:08:00,4
2,D0265,O000003,183.42,1,Kolkata,2025-01-06 19:27:00,2025-01-06 20:48:00,3
3,D0222,O000004,157.547,4,Kochi,2025-01-28 09:58:00,2025-01-28 10:51:00,1
4,D0187,O000005,1500.0,6,Mumbai,2025-01-28 18:46:00,2025-01-28 19:24:00,3


In [10]:
# Persist the generated orders as CSV, leaving out the DataFrame index.
# NOTE(review): this is an absolute, machine-specific Windows path; the target
# directory has to exist on whichever machine runs this cell.
df.to_csv(
    path_or_buf="C:/Users/DELL/Downloads/Swiggy Case Study/swiggy_orders_final.csv",
    index=False,
)