In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the de-duplicated bookings written by the previous step.
# Point `file_path` at the CSV's real location if it is not in the working directory.
file_path = 'bookings_no_duplicates.csv'
data_no_duplicates = pd.read_csv(filepath_or_buffer=file_path)

# Work on an independent (deep) copy so the frame as loaded stays untouched.
data_corrected = data_no_duplicates.copy(deep=True)

In [3]:
# Step 1: Inspect current data types
# Show the current dtype of every column.
print("Current Data Types:")
inferred_dtypes = data_corrected.dtypes
display(inferred_dtypes)

# Preview the first rows of the whole frame.
print("\nSample Values for All Columns:")
first_rows = data_corrected.head()
display(first_rows)

# Look more closely at the two cancellation columns (first ten rows).
print("\nSample Values for Canceled_Rides_by_Customer and Canceled_Rides_by_Driver:")
cancellation_columns = ['Canceled_Rides_by_Customer', 'Canceled_Rides_by_Driver']
display(data_corrected.loc[:, cancellation_columns].head(10))

Current Data Types:


Date                           object
Time                           object
Booking_ID                     object
Booking_Status                 object
Customer_ID                    object
Vehicle_Type                   object
Pickup_Location                object
Drop_Location                  object
V_TAT                         float64
C_TAT                         float64
Canceled_Rides_by_Customer     object
Canceled_Rides_by_Driver       object
Incomplete_Rides               object
Incomplete_Rides_Reason        object
Booking_Value                   int64
Payment_Method                 object
Ride_Distance                   int64
Driver_Ratings                float64
Customer_Rating               float64
Vehicle Images                 object
dtype: object


Sample Values for All Columns:


Unnamed: 0,Date,Time,Booking_ID,Booking_Status,Customer_ID,Vehicle_Type,Pickup_Location,Drop_Location,V_TAT,C_TAT,Canceled_Rides_by_Customer,Canceled_Rides_by_Driver,Incomplete_Rides,Incomplete_Rides_Reason,Booking_Value,Payment_Method,Ride_Distance,Driver_Ratings,Customer_Rating,Vehicle Images
0,2024-07-26 14:00:00,14:00:00,CNR7153255142,Canceled by Driver,CID713523,Prime Sedan,Tumkur Road,RT Nagar,0.0,0.0,,Personal & Car related issue,,,444,,0,0.0,0.0,https://cdn-icons-png.flaticon.com/128/14183/1...
1,2024-07-25 22:20:00,22:20:00,CNR2940424040,Success,CID225428,Bike,Magadi Road,Varthur,203.0,30.0,,,No,,158,Cash,13,4.1,4.0,https://cdn-icons-png.flaticon.com/128/9983/99...
2,2024-07-30 19:59:00,19:59:00,CNR2982357879,Success,CID270156,Prime SUV,Sahakar Nagar,Varthur,238.0,130.0,,,No,,386,UPI,40,4.2,4.8,https://cdn-icons-png.flaticon.com/128/9983/99...
3,2024-07-22 03:15:00,03:15:00,CNR2395710036,Canceled by Customer,CID581320,eBike,HSR Layout,Vijayanagar,0.0,0.0,Driver is not moving towards pickup location,,,,384,,0,0.0,0.0,https://cdn-icons-png.flaticon.com/128/6839/68...
4,2024-07-02 09:02:00,09:02:00,CNR1797421769,Success,CID939555,Mini,Rajajinagar,Chamarajpet,252.0,80.0,,,No,,822,Credit Card,45,4.0,3.0,https://cdn-icons-png.flaticon.com/128/3202/32...



Sample Values for Canceled_Rides_by_Customer and Canceled_Rides_by_Driver:


Unnamed: 0,Canceled_Rides_by_Customer,Canceled_Rides_by_Driver
0,,Personal & Car related issue
1,,
2,,
3,Driver is not moving towards pickup location,
4,,
5,,
6,,
7,,Personal & Car related issue
8,,
9,,Personal & Car related issue


In [4]:
# Step 2: Handle Date and Time columns
# Produce a single `Datetime` column and remove the separate `Date`/`Time` inputs.
if 'Datetime' in data_corrected.columns:
    # If Datetime exists, ensure it's datetime64 (unparseable entries become NaT)
    data_corrected['Datetime'] = pd.to_datetime(data_corrected['Datetime'], errors='coerce')
    print("\nDatetime column already exists and is converted to datetime64.")
elif 'Date' in data_corrected.columns and 'Time' in data_corrected.columns:
    date_values = data_corrected['Date']
    time_values = data_corrected['Time']
    if pd.api.types.is_numeric_dtype(date_values):
        # Excel numeric format: day serials counted from 1899-12-30, with the
        # time expressed as a fraction of a day.
        combined_datetime = (
            pd.to_datetime(date_values, unit='d', origin='1899-12-30')
            + pd.to_timedelta(time_values, unit='d')
        )
    else:
        # Text or datetime-like dates. Branching on "numeric vs. everything else"
        # (instead of comparing against the literal 'object' dtype) keeps text
        # columns out of the Excel fallback even when pandas stores them with a
        # dedicated string dtype or a non-nanosecond / tz-aware datetime dtype.
        combined_datetime = pd.to_datetime(date_values, errors='coerce')
        parsed_dates = combined_datetime.dropna()
        # True only when every parsed value sits exactly at midnight, i.e. the
        # Date column carried no time-of-day of its own.
        date_has_no_time = (
            pd.api.types.is_datetime64_any_dtype(combined_datetime)
            and not parsed_dates.empty
            and bool((parsed_dates == parsed_dates.dt.normalize()).all())
        )
        if date_has_no_time:
            # Add the Time column so it is not silently lost when it is dropped
            # below. Times that cannot be parsed contribute a zero offset, so
            # the date itself is still kept.
            time_offsets = pd.to_timedelta(time_values.astype(str), errors='coerce')
            combined_datetime = combined_datetime + time_offsets.fillna(pd.Timedelta(0))
    data_corrected['Datetime'] = combined_datetime
    # Rebind instead of dropping in place.
    data_corrected = data_corrected.drop(columns=['Date', 'Time'])
    print("\nDate and Time columns combined into Datetime.")
else:
    print("\nWarning: Neither 'Datetime' nor 'Date'/'Time' columns found. Please check the dataset.")


Date and Time columns combined into Datetime.


In [5]:
# Step 3: Convert columns to appropriate data types
# Every conversion is guarded by a column-existence check, so columns that are
# absent from the frame are simply skipped.

# String columns -> pandas nullable 'string' dtype
string_columns = ['Booking_ID', 'Customer_ID', 'Pickup_Location', 'Drop_Location', 'Incomplete_Rides_Reason', 'Vehicle Images']
for col in string_columns:
    if col in data_corrected.columns:
        data_corrected[col] = data_corrected[col].astype('string')

# Categorical columns -> 'category' dtype
categorical_columns = ['Booking_Status', 'Vehicle_Type', 'Payment_Method', 'Incomplete_Rides']
for col in categorical_columns:
    if col in data_corrected.columns:
        data_corrected[col] = data_corrected[col].astype('category')

# Numeric columns -> float64 (values that cannot be parsed become NaN)
numeric_columns = ['V_TAT', 'C_TAT', 'Booking_Value', 'Ride_Distance', 'Driver_Ratings', 'Customer_Rating']
for col in numeric_columns:
    if col in data_corrected.columns:
        data_corrected[col] = pd.to_numeric(data_corrected[col], errors='coerce').astype('float64')

# Handle Canceled_Rides_by_Customer and Canceled_Rides_by_Driver if they exist
for col in ['Canceled_Rides_by_Customer', 'Canceled_Rides_by_Driver']:
    if col in data_corrected.columns:
        column_values = data_corrected[col]
        # Treat both object and dedicated string dtypes as text-like. Checking
        # only for 'object' would send an already-converted 'string' column
        # (e.g. when this cell is run a second time) to the numeric branch,
        # where errors='coerce' would replace every text value with NaN.
        is_text_like = (
            pd.api.types.is_object_dtype(column_values)
            or pd.api.types.is_string_dtype(column_values)
        )
        # Check if the column contains non-numeric (alphabetic) data
        if is_text_like and column_values.str.contains(r'[a-zA-Z]', na=False).any():
            print(f"\nWarning: {col} contains text data. Converting to string instead of float.")
            data_corrected[col] = column_values.astype('string')
        else:
            data_corrected[col] = pd.to_numeric(column_values, errors='coerce').astype('float64')





In [6]:
# Step 4: Validate data types
print("\nData Types After Correction:")
display(data_corrected.dtypes)


Data Types After Correction:


Booking_ID                    string[python]
Booking_Status                      category
Customer_ID                   string[python]
Vehicle_Type                        category
Pickup_Location               string[python]
Drop_Location                 string[python]
V_TAT                                float64
C_TAT                                float64
Canceled_Rides_by_Customer    string[python]
Canceled_Rides_by_Driver      string[python]
Incomplete_Rides                    category
Incomplete_Rides_Reason       string[python]
Booking_Value                        float64
Payment_Method                      category
Ride_Distance                        float64
Driver_Ratings                       float64
Customer_Rating                      float64
Vehicle Images                string[python]
Datetime                      datetime64[ns]
dtype: object

In [7]:
# Step 5: Save the dataset with corrected data types
data_corrected.to_csv('bookings_corrected_datatypes.csv', index=False)
print("Dataset with corrected data types saved as 'bookings_corrected_datatypes.csv'")

Dataset with corrected data types saved as 'bookings_corrected_datatypes.csv'
