In [56]:
import pandas as pd
import random
from datetime import datetime, timedelta

# --- Dataset configuration ---
# Number of synthetic booking rows to generate.
num_rows = 100_000

# Localities sampled for both pickup and drop locations (50 entries).
# NOTE(review): "Malleshwaram" and "Malleswaram" differ by a single letter and
# look like two spellings of one place; "Ulsoor"/"Halasuru" may be aliases as
# well. If so, those places are sampled twice as often — confirm intent.
areas_bangalore = [
    "Koramangala", "Indiranagar", "Jayanagar", "Whitefield", "MG Road",
    "BTM Layout", "Rajajinagar", "Basavanagudi", "Hebbal", "Bellandur",
    "Hennur", "Banaswadi", "Ulsoor", "Kengeri", "Bannerghatta Road",
    "Marathahalli", "KR Puram", "Sarjapur", "Vijayanagar", "Yelahanka",
    "Electronic City", "CV Raman Nagar", "Nandi Hills", "Lalbagh", "Jakkur",
    "R T Nagar", "Domlur", "Shivaji Nagar", "Hosur Road", "Rajarajeshwari Nagar",
    "Kothanur", "Malleshwaram", "Peenya", "Varthur", "JP Nagar",
    "Sarjapur Road", "Sadashivanagar", "Halasuru", "Yeshwanthpur", "Bommanahalli",
    "Kudlu Gate", "Ramagondanahalli", "Cunningham Road", "Vikram Sarabhai Road", "Seshadripuram",
    "Malleswaram", "Ragigudda", "Adugodi", "Kanakapura Road", "Frazer Town",
]

# Reason strings used when a cancelled booking is attributed to the customer.
cancel_reasons_customer = [
    "Driver is not moving towards pickup location",
    "Driver asked to cancel",
    "AC is Not working",
    "Change of plans",
    "Wrong Address",
    "Else NA",
]

# Reason strings used when a cancelled booking is attributed to the driver.
cancel_reasons_driver = [
    "Personal & Car related issue",
    "Customer related issue",
    "Customer was coughing/sick",
    "More than permitted people in there",
    "Else NA",
]

# Vehicle categories and payment options, each sampled uniformly per booking.
vehicle_types = [
    "Auto",
    "Prime Plus",
    "Prime Sedan",
    "Mini",
    "Bike",
    "eBike",
    "Prime SUV",
]
payment_methods = ["Cash", "Credit Card", "Online Payment"]

# Generate the dataset
def generate_data(num_rows, seed=None):
    """Generate a synthetic ride-booking dataset.

    Each row is one booking with randomly drawn date, time, IDs, status,
    vehicle type, locations, turnaround times, cancellation/incomplete
    details, booking value, payment method, ride distance and ratings.

    Parameters
    ----------
    num_rows : int
        Number of bookings to generate. Values <= 0 yield an empty frame
        that still carries all 19 columns.
    seed : int or None, optional
        When given, rows are drawn from a private ``random.Random(seed)`` so
        the output is reproducible and the global ``random`` state is left
        untouched. When ``None`` (default) the module-level ``random`` state
        is used, as before.

    Returns
    -------
    pandas.DataFrame
        One row per booking, 19 columns (see ``columns`` below).
    """
    # Private seeded generator when requested; otherwise the shared module
    # state, so callers who seed `random` themselves keep working.
    rng = random if seed is None else random.Random(seed)

    row_count = max(num_rows, 0)
    start_date = datetime(2025, 2, 1)

    # Sample the numeric part of the Booking IDs without replacement so the
    # IDs are guaranteed unique (independent draws can collide at 100k rows).
    booking_numbers = rng.sample(range(1_000_000_000, 10_000_000_000), row_count)

    data = []
    for booking_number in booking_numbers:
        # Date and Time
        # NOTE(review): a 0-29 day offset from 2025-02-01 reaches 2025-03-02
        # (February 2025 has 28 days). Kept as-is; confirm whether the window
        # was meant to be "February only".
        date = start_date + timedelta(days=rng.randint(0, 29))
        time = f"{rng.randint(0, 23):02}:{rng.randint(0, 59):02}"

        # Booking ID
        booking_id = f"CNR{booking_number}"

        # Booking Status (Success 62%, other statuses share the remaining 38%)
        if rng.random() < 0.62:
            booking_status = 'Success'
        else:
            booking_status = rng.choice(['Cancelled', 'Completed', 'Failed'])
        is_success = booking_status == 'Success'
        is_cancelled = booking_status == 'Cancelled'

        # Customer ID
        customer_id = f"CUS{rng.randint(10000, 99999)}"

        # Vehicle Type
        vehicle_type = rng.choice(vehicle_types)

        # Pickup and Drop Locations (drawn independently, so they may coincide)
        pickup_location = rng.choice(areas_bangalore)
        drop_location = rng.choice(areas_bangalore)

        # VTAT and CTAT (only for Success bookings)
        vtat = rng.randint(1, 15) if is_success else None
        ctat = rng.randint(1, 15) if is_success else None

        # Cancellation Reasons
        # NOTE(review): every cancelled booking gets BOTH a customer and a
        # driver reason. Kept as-is; confirm whether only one party should
        # be recorded per cancellation.
        cancel_reason_customer = rng.choice(cancel_reasons_customer) if is_cancelled else None
        cancel_reason_driver = rng.choice(cancel_reasons_driver) if is_cancelled else None

        # Incomplete Rides (less than 6% of total): a 5% Bernoulli draw.
        # The previous 1-in-6 choice produced ~16.7%.
        incomplete_ride = rng.random() < 0.05
        incomplete_reason = (
            rng.choice(["Customer Demand", "Vehicle Breakdown", "Other Issue"])
            if incomplete_ride else None
        )

        # Booking Value (70% in 100-500, 28% in 501-1000, 2% in 1001-5000).
        # A single draw is compared against cumulative thresholds; two
        # independent draws would skew the split to roughly 70/8/22.
        value_roll = rng.random()
        if value_roll < 0.70:
            booking_value = rng.randint(100, 500)
        elif value_roll < 0.98:
            booking_value = rng.randint(501, 1000)
        else:
            booking_value = rng.randint(1001, 5000)

        # Payment Method (random choice from the list)
        payment_method = rng.choice(payment_methods)

        # Ride Distance (integer 1-20, drawn for every booking regardless of status)
        ride_distance = rng.randint(1, 20)

        # Ratings (for successful bookings), one decimal place in [3.0, 5.0]
        driver_rating = round(rng.uniform(3.0, 5.0), 1) if is_success else None
        customer_rating = round(rng.uniform(3.0, 5.0), 1) if is_success else None

        # Add all the fields to the data (order must match `columns` below)
        data.append([
            date.strftime("%Y-%m-%d"), time, booking_id, booking_status, customer_id, vehicle_type,
            pickup_location, drop_location, vtat, ctat,
            cancel_reason_customer, cancel_reason_driver, incomplete_ride, incomplete_reason,
            booking_value, payment_method, ride_distance, driver_rating, customer_rating
        ])

    # Create the DataFrame
    columns = [
        'Date', 'Time', 'Booking ID', 'Booking Status', 'Customer ID', 'Vehicle Type',
        'Pickup Location', 'Drop Location', 'Avg VTAT', 'Avg CTAT',
        'Canceled Rides by Customer', 'Canceled Rides by Driver', 'Incomplete Rides', 'Incomplete Rides Reason',
        'Booking Value', 'Payment Method', 'Ride Distance', 'Driver Ratings', 'Customer Rating'
    ]

    return pd.DataFrame(data, columns=columns)


In [58]:
# Display the booking DataFrame (pandas truncates the repr to head/tail rows).
# NOTE(review): `df` is not assigned in any cell visible here and the execution
# counts skip from 56 to 58 — presumably a missing cell ran
# `df = generate_data(num_rows)`; confirm so Restart & Run All works.
df

Unnamed: 0,Date,Time,Booking ID,Booking Status,Customer ID,Vehicle Type,Pickup Location,Drop Location,Avg VTAT,Avg CTAT,Canceled Rides by Customer,Canceled Rides by Driver,Incomplete Rides,Incomplete Rides Reason,Booking Value,Payment Method,Ride Distance,Driver Ratings,Customer Rating
0,2025-02-17,03:50,CNR6401163461,Success,CUS78406,Auto,Shivaji Nagar,Jayanagar,11.0,11.0,,,False,,273,Online Payment,20,4.6,4.9
1,2025-02-18,15:20,CNR8850603035,Completed,CUS24170,Mini,Vijayanagar,Varthur,,,,,False,,3213,Online Payment,15,,
2,2025-02-22,16:53,CNR6169023642,Success,CUS18796,Bike,JP Nagar,Jayanagar,6.0,2.0,,,False,,229,Credit Card,3,3.2,4.5
3,2025-02-19,04:53,CNR7035082891,Success,CUS53478,Prime SUV,Cunningham Road,Varthur,4.0,8.0,,,False,,402,Credit Card,4,3.2,4.8
4,2025-02-14,07:08,CNR1779008205,Completed,CUS58374,Auto,Shivaji Nagar,KR Puram,,,,,True,Customer Demand,256,Online Payment,2,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,2025-02-23,04:26,CNR9347524053,Success,CUS71443,eBike,Jakkur,Hosur Road,7.0,10.0,,,False,,301,Cash,14,4.8,4.5
99996,2025-02-22,09:43,CNR1460207836,Success,CUS41992,eBike,Shivaji Nagar,MG Road,10.0,3.0,,,False,,165,Online Payment,3,4.4,4.3
99997,2025-03-02,21:13,CNR6609796722,Success,CUS40823,Prime Plus,BTM Layout,Shivaji Nagar,3.0,10.0,,,False,,299,Online Payment,13,3.2,3.1
99998,2025-02-28,13:21,CNR3498983028,Failed,CUS80230,Prime Plus,CV Raman Nagar,Yelahanka,,,,,True,Other Issue,467,Online Payment,19,,


In [60]:
# Sanity-check the frame dimensions as (rows, columns).
df.shape

(100000, 19)

In [62]:
# Summarize per-column dtypes and non-null counts to see which fields are sparse.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 19 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Date                        100000 non-null  object 
 1   Time                        100000 non-null  object 
 2   Booking ID                  100000 non-null  object 
 3   Booking Status              100000 non-null  object 
 4   Customer ID                 100000 non-null  object 
 5   Vehicle Type                100000 non-null  object 
 6   Pickup Location             100000 non-null  object 
 7   Drop Location               100000 non-null  object 
 8   Avg VTAT                    61843 non-null   float64
 9   Avg CTAT                    61843 non-null   float64
 10  Canceled Rides by Customer  12677 non-null   object 
 11  Canceled Rides by Driver    12677 non-null   object 
 12  Incomplete Rides            100000 non-null  bool   
 13  Incomplete Ride

In [82]:
# Parse the 'Date' strings (YYYY-MM-DD) into datetime64 values.
# NOTE: the parsed Series is only displayed, not assigned back, so the
# df['Date'] column itself is left unchanged by this cell.
pd.to_datetime(df['Date'], format="%Y-%m-%d")

0       2025-02-17
1       2025-02-18
2       2025-02-22
3       2025-02-19
4       2025-02-14
           ...    
99995   2025-02-23
99996   2025-02-22
99997   2025-03-02
99998   2025-02-28
99999   2025-02-11
Name: Date, Length: 100000, dtype: datetime64[ns]

In [91]:
# Count rows that exactly repeat an earlier row across all columns.
# NOTE: this does not by itself verify that 'Booking ID' values are unique.
df.duplicated().sum()

0