In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the raw bookings workbook into a DataFrame
file_path = "data/Bookings-20000-Rows.xlsx"
df = pd.read_excel(io=file_path)

# Cleaning: impute gaps, drop unused columns, discard invalid rows, parse dates
# Rows with no recorded payment method are labelled "Unknown"
df['Payment_Method'] = df['Payment_Method'].fillna("Unknown")

# Missing ratings take the mean of the ratings present in the same column
for rating_col in ('Driver_Ratings', 'Customer_Rating'):
    rating_mean = df[rating_col].mean()
    df[rating_col] = df[rating_col].fillna(rating_mean)

# errors='ignore' skips any listed column the workbook does not contain
columns_to_drop = [
    'Canceled_Rides_by_Customer',
    'Canceled_Rides_by_Driver',
    'Incomplete_Rides_Reason',
    'Vehicle Images',
]
df = df.drop(columns=columns_to_drop, errors='ignore')

# Keep only rows where both Booking_Value and V_TAT are strictly positive
has_positive_value = df['Booking_Value'].gt(0)
has_positive_tat = df['V_TAT'].gt(0)
df = df[has_positive_value & has_positive_tat]

# Values that cannot be parsed as dates become NaT instead of raising
if 'Trip_Date' in df:
    df['Trip_Date'] = pd.to_datetime(df['Trip_Date'], errors='coerce')

# Remove outliers in Booking_Value: keep rows within 1.5 * IQR of the quartiles
Q1, Q3 = df['Booking_Value'].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
# between() includes both endpoints by default, matching >= lower and <= upper
df = df[df['Booking_Value'].between(lower_bound, upper_bound)]

# Visualization 1: Payment Method Distribution
fig, ax = plt.subplots(figsize=(12, 6))
# hue repeats the x column so each bar takes its own palette colour;
# dodge and the legend are switched off
sns.countplot(
    data=df,
    x='Payment_Method',
    hue='Payment_Method',
    dodge=False,
    palette='Set2',
    legend=False,
    ax=ax,
)
ax.set_title("Payment Method Distribution")
ax.set_xlabel("Payment Method")
ax.set_ylabel("Number of Rides")
# Tilt the category labels on the x axis by 45 degrees
for tick_label in ax.get_xticklabels():
    tick_label.set_rotation(45)
fig.tight_layout()
fig.savefig("data/payment_method_distribution.png")
plt.close(fig)

# Visualization 2: Fare Distribution
fig, ax = plt.subplots(figsize=(12, 6))
# 40-bin histogram of Booking_Value with a KDE curve overlaid
sns.histplot(
    data=df,
    x='Booking_Value',
    bins=40,
    kde=True,
    color='skyblue',
    ax=ax,
)
ax.set_title("Fare Distribution")
ax.set_xlabel("Fare Amount")
ax.set_ylabel("Frequency")
fig.tight_layout()
fig.savefig("data/fare_distribution.png")
plt.close(fig)

# Visualization 3: Fare by Ride Type
fig, ax = plt.subplots(figsize=(12, 6))
# One box of Booking_Value per Vehicle_Type; hue repeats the x column so each
# box takes its own palette colour, with dodge and the legend switched off
sns.boxplot(
    data=df,
    x='Vehicle_Type',
    y='Booking_Value',
    hue='Vehicle_Type',
    dodge=False,
    palette='pastel',
    legend=False,
    ax=ax,
)
ax.set_title("Fare by Ride Type")
ax.set_xlabel("Vehicle Type")
ax.set_ylabel("Fare Amount")
fig.tight_layout()
fig.savefig("data/fare_by_ride_type.png")
plt.close(fig)

# Visualization 4: Ratings Distribution
fig, ax = plt.subplots(figsize=(12, 6))
# Overlay one filled density curve per rating column on the same axes
rating_curves = (
    ('Driver_Ratings', 'Driver Ratings'),
    ('Customer_Rating', 'Customer Ratings'),
)
for rating_column, curve_label in rating_curves:
    sns.kdeplot(df[rating_column], label=curve_label, fill=True, ax=ax)
ax.set_title("Ratings Distribution")
ax.set_xlabel("Rating")
ax.legend()
fig.tight_layout()
fig.savefig("data/ratings_distribution.png")
plt.close(fig)

# Save the cleaned data twice: as an Excel workbook and as a JSON file
cleaned_excel_path = "data/cleaned_data.xlsx"
cleaned_json_path = "data/cleaned_data.json"
# index=False leaves the DataFrame index out of the sheet
df.to_excel(cleaned_excel_path, index=False)
# orient="records" writes a JSON array with one object per row
df.to_json(cleaned_json_path, orient="records")

# Output file paths
print("Saved plots:",
      "data/payment_method_distribution.png",
      "data/fare_distribution.png",
      "data/fare_by_ride_type.png",
      "data/ratings_distribution.png")
print("Cleaned data saved to:", cleaned_excel_path)
# The JSON export was previously written without being reported
print("Cleaned data (JSON) saved to:", cleaned_json_path)



Saved plots: data/payment_method_distribution.png data/fare_distribution.png data/fare_by_ride_type.png data/ratings_distribution.png
Cleaned data saved to: data/cleaned_data.xlsx
