In [1]:
!pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Read the CSV produced by the previous step.
# Change file_path if 'bookings_outliers_handled.csv' is not in the working directory.
file_path = 'bookings_outliers_handled.csv'
data_cleaned = pd.read_csv(file_path)

# All new columns are added to an independent (deep) copy, so data_cleaned
# stays exactly as it was loaded.
data_processed = data_cleaned.copy(deep=True)

In [4]:
# Step 1: Inspect numeric columns
# These are the columns that get scaled in the following steps.
numeric_columns = [
    'V_TAT',
    'C_TAT',
    'Booking_Value',
    'Ride_Distance',
    'Driver_Ratings',
    'Customer_Rating',
]
print("Summary Statistics Before Standardization/Normalization:")
# Baseline statistics, to compare against the scaled columns later on.
display(data_processed.loc[:, numeric_columns].describe())

Summary Statistics Before Standardization/Normalization:


Unnamed: 0,V_TAT,C_TAT,Booking_Value,Ride_Distance,Driver_Ratings,Customer_Rating
count,103024.0,103024.0,103024.0,103024.0,103024.0,103024.0
mean,105.441984,52.323876,521.891093,14.083777,2.481997,2.482529
std,103.33921,49.353533,437.960673,15.557015,1.991983,1.99277
min,0.0,0.0,128.0,0.0,0.0,0.0
25%,0.0,0.0,242.0,0.0,0.0,0.0
50%,84.0,45.0,386.0,8.0,3.4,3.4
75%,196.0,95.0,621.0,26.0,4.2,4.2
max,287.0,135.0,1899.0,45.0,5.0,5.0


In [5]:
# Step 2: Initialize scalers
# One StandardScaler and one MinMaxScaler, both with default settings; the
# later cells call fit_transform on them column by column.
standard_scaler, minmax_scaler = StandardScaler(), MinMaxScaler()

In [6]:
# Step 3: Standardize and normalize numeric columns
# Each source column is left in place; its scaled versions are stored in two
# new columns named '<col>_std' and '<col>_norm'.
for col in numeric_columns:
    # Double brackets keep the selection 2-D (a one-column DataFrame), which
    # is the input shape passed to fit_transform.
    single_column = data_processed[[col]]

    # The scalers are re-fitted on every iteration, so after the loop they
    # only hold the parameters of the last column processed.
    standardized = standard_scaler.fit_transform(single_column)
    data_processed[f'{col}_std'] = standardized

    normalized = minmax_scaler.fit_transform(single_column)
    data_processed[f'{col}_norm'] = normalized

In [7]:
# Step 4: Handle Canceled_Rides_by_Customer and Canceled_Rides_by_Driver if they exist and are numeric
# Columns that are absent are ignored silently. Columns that are present but
# not numeric are reported and skipped.
for col in ['Canceled_Rides_by_Customer', 'Canceled_Rides_by_Driver']:
    if col not in data_processed.columns:
        continue

    # Accept any numeric dtype. The previous check (dtype == 'float64') would
    # have reported integer or float32 columns as "non-numeric (string)" and
    # skipped them.
    if pd.api.types.is_numeric_dtype(data_processed[col]):
        data_processed[f'{col}_std'] = standard_scaler.fit_transform(data_processed[[col]])
        data_processed[f'{col}_norm'] = minmax_scaler.fit_transform(data_processed[[col]])
        print(f"\nStandardized and normalized {col}")
    else:
        print(f"\nSkipping {col}: Contains non-numeric data (string).")


Skipping Canceled_Rides_by_Customer: Contains non-numeric data (string).

Skipping Canceled_Rides_by_Driver: Contains non-numeric data (string).


In [8]:
# Step 5: Validate transformations
# Show describe() for the derived columns so the scaling can be checked by eye
# against the summary printed before scaling.
print("\nSummary Statistics for Standardized Columns:")
std_columns = [name + '_std' for name in numeric_columns]
display(data_processed.loc[:, std_columns].describe())

print("\nSummary Statistics for Normalized Columns:")
norm_columns = [name + '_norm' for name in numeric_columns]
display(data_processed.loc[:, norm_columns].describe())


Summary Statistics for Standardized Columns:


Unnamed: 0,V_TAT_std,C_TAT_std,Booking_Value_std,Ride_Distance_std,Driver_Ratings_std,Customer_Rating_std
count,103024.0,103024.0,103024.0,103024.0,103024.0,103024.0
mean,-6.910660000000001e-17,-1.834566e-17,-5.903717000000001e-17,-1.006942e-17,3.0828990000000003e-17,6.627888e-17
std,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005
min,-1.020353,-1.06019,-0.8993799,-0.9053051,-1.245999,-1.245774
25%,-1.020353,-1.06019,-0.6390812,-0.9053051,-1.245999,-1.245774
50%,-0.2074923,-0.1483969,-0.310283,-0.3910651,0.4608509,0.460402
75%,0.8763223,0.8647067,0.2262975,0.7659748,0.8624627,0.8618552
max,1.756922,1.67519,3.144382,1.987295,1.264075,1.263308



Summary Statistics for Normalized Columns:


Unnamed: 0,V_TAT_norm,C_TAT_norm,Booking_Value_norm,Ride_Distance_norm,Driver_Ratings_norm,Customer_Rating_norm
count,103024.0,103024.0,103024.0,103024.0,103024.0,103024.0
mean,0.367394,0.387584,0.222412,0.312973,0.496399,0.496506
std,0.360067,0.365582,0.247296,0.345711,0.398397,0.398554
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.06437,0.0,0.0,0.0
50%,0.292683,0.333333,0.14568,0.177778,0.68,0.68
75%,0.682927,0.703704,0.278374,0.577778,0.84,0.84
max,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
# Step 6: Save the dataset with standardized and normalized columns
# The file name is defined once so the written path and the confirmation
# message cannot drift apart.
output_file = 'bookings_standardized_normalized.csv'
# index=False: the DataFrame index is not written as a column.
data_processed.to_csv(output_file, index=False)
print(f"Dataset with standardized and normalized columns saved as '{output_file}'")

Dataset with standardized and normalized columns saved as 'bookings_standardized_normalized.csv'
