In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the bookings table written by the previous (datatype-correction) step.
# Edit `file_path` if 'bookings_corrected_datatypes.csv' is not in the working directory.
file_path = 'bookings_corrected_datatypes.csv'
data_corrected = pd.read_csv(filepath_or_buffer=file_path)

In [3]:
# Work on an independent (deep) copy so the frame loaded above stays untouched
# by the clipping performed in the following steps.
data_cleaned = data_corrected.copy(deep=True)

In [4]:
# Step 1: Inspect numeric columns
# These are the columns examined here and processed by the outlier step below.
numeric_columns = [
    'V_TAT',
    'C_TAT',
    'Booking_Value',
    'Ride_Distance',
    'Driver_Ratings',
    'Customer_Rating',
]
print("Summary Statistics Before Handling Outliers:")
summary_before = data_cleaned.loc[:, numeric_columns].describe()
display(summary_before)

Summary Statistics Before Handling Outliers:


Unnamed: 0,V_TAT,C_TAT,Booking_Value,Ride_Distance,Driver_Ratings,Customer_Rating
count,103024.0,103024.0,103024.0,103024.0,103024.0,103024.0
mean,106.096502,52.697381,548.751883,14.189927,2.481997,2.482529
std,104.532203,50.00509,536.541221,15.77627,1.991983,1.99277
min,0.0,0.0,100.0,0.0,0.0,0.0
25%,0.0,0.0,242.0,0.0,0.0,0.0
50%,84.0,45.0,386.0,8.0,3.4,3.4
75%,196.0,95.0,621.0,26.0,4.2,4.2
max,308.0,145.0,2999.0,49.0,5.0,5.0


In [5]:
# Step 2: Handle outliers for numeric columns
# Two rules are applied, one per kind of column:
#   * rating columns are clipped to the closed range [0, 5];
#   * every other numeric column is capped (winsorized) at its own 5th and
#     95th percentiles, computed from the column before it is clipped.
# NOTE: an earlier version also computed Q1/Q3/IQR fences here but never used
# them; that dead code was removed so the cell only shows the rule it applies.
rating_columns = ('Driver_Ratings', 'Customer_Rating')

for col in numeric_columns:
    if col in rating_columns:
        # Cap ratings between 0 and 5
        data_cleaned[col] = data_cleaned[col].clip(lower=0, upper=5)
        print(f"\nRatings capped between 0 and 5 for {col}")
    else:
        # Cap outliers at 5th and 95th percentiles
        lower_cap = data_cleaned[col].quantile(0.05)
        upper_cap = data_cleaned[col].quantile(0.95)
        data_cleaned[col] = data_cleaned[col].clip(lower=lower_cap, upper=upper_cap)
        print(f"\nOutliers capped for {col}:")
        print(f"Lower cap (5th percentile): {lower_cap:.2f}")
        print(f"Upper cap (95th percentile): {upper_cap:.2f}")


Outliers capped for V_TAT:
Lower cap (5th percentile): 0.00
Upper cap (95th percentile): 287.00

Outliers capped for C_TAT:
Lower cap (5th percentile): 0.00
Upper cap (95th percentile): 135.00

Outliers capped for Booking_Value:
Lower cap (5th percentile): 128.00
Upper cap (95th percentile): 1899.00

Outliers capped for Ride_Distance:
Lower cap (5th percentile): 0.00
Upper cap (95th percentile): 45.00

Ratings capped between 0 and 5 for Driver_Ratings

Ratings capped between 0 and 5 for Customer_Rating


In [6]:
# Step 3: Handle Canceled_Rides_by_Customer and Canceled_Rides_by_Driver if they exist and are numeric
# A column that is absent from the frame is silently ignored. A column that is
# present but not numeric is reported and left unchanged. A numeric column is
# capped at its own 5th and 95th percentiles.
for col in ['Canceled_Rides_by_Customer', 'Canceled_Rides_by_Driver']:
    if col not in data_cleaned.columns:
        continue

    column_values = data_cleaned[col]
    # Accept any numeric dtype (ints, floats, nullable numerics), not only
    # float64, so an integer-typed column is not misreported as a string.
    # Booleans count as "numeric" for pandas but have no meaningful
    # percentiles, so they are excluded explicitly.
    is_numeric = (
        pd.api.types.is_numeric_dtype(column_values)
        and not pd.api.types.is_bool_dtype(column_values)
    )
    if not is_numeric:
        print(f"\nSkipping {col}: Contains non-numeric data (string).")
        continue

    # Cap outliers at 5th and 95th percentiles
    lower_cap = column_values.quantile(0.05)
    upper_cap = column_values.quantile(0.95)
    data_cleaned[col] = column_values.clip(lower=lower_cap, upper=upper_cap)
    print(f"\nOutliers capped for {col}:")
    print(f"Lower cap (5th percentile): {lower_cap:.2f}")
    print(f"Upper cap (95th percentile): {upper_cap:.2f}")


Skipping Canceled_Rides_by_Customer: Contains non-numeric data (string).

Skipping Canceled_Rides_by_Driver: Contains non-numeric data (string).


In [7]:
# Step 4: Validate outliers
# Re-run the same summary as Step 1 so the min/max rows can be compared
# against the caps printed while handling outliers.
print("\nSummary Statistics After Handling Outliers:")
summary_after = data_cleaned.loc[:, numeric_columns].describe()
display(summary_after)


Summary Statistics After Handling Outliers:


Unnamed: 0,V_TAT,C_TAT,Booking_Value,Ride_Distance,Driver_Ratings,Customer_Rating
count,103024.0,103024.0,103024.0,103024.0,103024.0,103024.0
mean,105.441984,52.323876,521.891093,14.083777,2.481997,2.482529
std,103.33921,49.353533,437.960673,15.557015,1.991983,1.99277
min,0.0,0.0,128.0,0.0,0.0,0.0
25%,0.0,0.0,242.0,0.0,0.0,0.0
50%,84.0,45.0,386.0,8.0,3.4,3.4
75%,196.0,95.0,621.0,26.0,4.2,4.2
max,287.0,135.0,1899.0,45.0,5.0,5.0


In [8]:
# Step 5: Save the dataset with handled outliers
# The row index is not written to the CSV (index=False).
data_cleaned.to_csv(path_or_buf='bookings_outliers_handled.csv', index=False)
print("Dataset with handled outliers saved as 'bookings_outliers_handled.csv'")

Dataset with handled outliers saved as 'bookings_outliers_handled.csv'
