# **Import Libraries**

In [3]:
# Core data and numerical libraries
import pandas as pd
import numpy as np

# Visualization libraries
import seaborn as sns
import matplotlib.pyplot as plt

# Preprocessing
from sklearn.preprocessing import StandardScaler

# Plot style
sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (8, 5)

print("Libraries imported successfully.")


Libraries imported successfully.


# **Load Dataset**

In [4]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("yashdevladdha/uber-ride-analytics-dashboard")

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'uber-ride-analytics-dashboard' dataset.
Path to dataset files: /kaggle/input/uber-ride-analytics-dashboard


In [5]:
import os

# List files in the downloaded directory to find the CSV file
csv_files = [f for f in os.listdir(path) if f.endswith('.csv')]

if csv_files:
    # Assuming there's only one relevant CSV file, or picking the first one
    csv_file_name = csv_files[0]
    full_csv_path = os.path.join(path, csv_file_name)
    df = pd.read_csv(full_csv_path)
    print(f"Dataset '{csv_file_name}' loaded successfully.")
else:
    print("No CSV files found in the dataset directory.")
    df = None # Or handle the error as appropriate

Dataset 'ncr_ride_bookings.csv' loaded successfully.


# **Read Data**

In [6]:
print("---Info----")
df.info()

---Info----
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 21 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Date                               150000 non-null  object 
 1   Time                               150000 non-null  object 
 2   Booking ID                         150000 non-null  object 
 3   Booking Status                     150000 non-null  object 
 4   Customer ID                        150000 non-null  object 
 5   Vehicle Type                       150000 non-null  object 
 6   Pickup Location                    150000 non-null  object 
 7   Drop Location                      150000 non-null  object 
 8   Avg VTAT                           139500 non-null  float64
 9   Avg CTAT                           102000 non-null  float64
 10  Cancelled Rides by Customer        10500 non-null   float64
 11  Reason for cancelling by Cu

In [7]:
print("\n DESCRIBE ")
print(df.describe())


 DESCRIBE 
            Avg VTAT       Avg CTAT  Cancelled Rides by Customer  \
count  139500.000000  102000.000000                      10500.0   
mean        8.456352      29.149636                          1.0   
std         3.773564       8.902577                          0.0   
min         2.000000      10.000000                          1.0   
25%         5.300000      21.600000                          1.0   
50%         8.300000      28.800000                          1.0   
75%        11.300000      36.800000                          1.0   
max        20.000000      45.000000                          1.0   

       Cancelled Rides by Driver  Incomplete Rides  Booking Value  \
count                    27000.0            9000.0  102000.000000   
mean                         1.0               1.0     508.295912   
std                          0.0               0.0     395.805774   
min                          1.0               1.0      50.000000   
25%                          1

In [8]:
df.head(10)

Unnamed: 0,Date,Time,Booking ID,Booking Status,Customer ID,Vehicle Type,Pickup Location,Drop Location,Avg VTAT,Avg CTAT,...,Reason for cancelling by Customer,Cancelled Rides by Driver,Driver Cancellation Reason,Incomplete Rides,Incomplete Rides Reason,Booking Value,Ride Distance,Driver Ratings,Customer Rating,Payment Method
0,2024-03-23,12:29:38,"""CNR5884300""",No Driver Found,"""CID1982111""",eBike,Palam Vihar,Jhilmil,,,...,,,,,,,,,,
1,2024-11-29,18:01:39,"""CNR1326809""",Incomplete,"""CID4604802""",Go Sedan,Shastri Nagar,Gurgaon Sector 56,4.9,14.0,...,,,,1.0,Vehicle Breakdown,237.0,5.73,,,UPI
2,2024-08-23,08:56:10,"""CNR8494506""",Completed,"""CID9202816""",Auto,Khandsa,Malviya Nagar,13.4,25.8,...,,,,,,627.0,13.58,4.9,4.9,Debit Card
3,2024-10-21,17:17:25,"""CNR8906825""",Completed,"""CID2610914""",Premier Sedan,Central Secretariat,Inderlok,13.1,28.5,...,,,,,,416.0,34.02,4.6,5.0,UPI
4,2024-09-16,22:08:00,"""CNR1950162""",Completed,"""CID9933542""",Bike,Ghitorni Village,Khan Market,5.3,19.6,...,,,,,,737.0,48.21,4.1,4.3,UPI
5,2024-02-06,09:44:56,"""CNR4096693""",Completed,"""CID4670564""",Auto,AIIMS,Narsinghpur,5.1,18.1,...,,,,,,316.0,4.85,4.1,4.6,UPI
6,2024-06-17,15:45:58,"""CNR2002539""",Completed,"""CID6800553""",Go Mini,Vaishali,Punjabi Bagh,7.1,20.4,...,,,,,,640.0,41.24,4.0,4.1,UPI
7,2024-03-19,17:37:37,"""CNR6568000""",Completed,"""CID8610436""",Auto,Mayur Vihar,Cyber Hub,12.1,16.5,...,,,,,,136.0,6.56,4.4,4.2,UPI
8,2024-09-14,12:49:09,"""CNR4510807""",No Driver Found,"""CID7873618""",Go Sedan,Noida Sector 62,Noida Sector 18,,,...,,,,,,,,,,
9,2024-12-16,19:06:48,"""CNR7721892""",Incomplete,"""CID5214275""",Auto,Rohini,Adarsh Nagar,6.1,26.0,...,,,,1.0,Other Issue,135.0,10.36,,,Cash


In [9]:
df.columns

Index(['Date', 'Time', 'Booking ID', 'Booking Status', 'Customer ID',
       'Vehicle Type', 'Pickup Location', 'Drop Location', 'Avg VTAT',
       'Avg CTAT', 'Cancelled Rides by Customer',
       'Reason for cancelling by Customer', 'Cancelled Rides by Driver',
       'Driver Cancellation Reason', 'Incomplete Rides',
       'Incomplete Rides Reason', 'Booking Value', 'Ride Distance',
       'Driver Ratings', 'Customer Rating', 'Payment Method'],
      dtype='object')

In [10]:
df.shape

(150000, 21)

# Cleaning Data

In [11]:
# Clean Booking ID and Customer ID
df['Booking ID'] = df['Booking ID'].str.replace('"', '').str.strip()
df['Customer ID'] = df['Customer ID'].str.replace('"', '').str.strip()

df[['Booking ID', 'Customer ID']].head()


#نظّفنا رقم الحجز وشلنا منهم علامات التنصيص والمسافات ,عشان مايصير نفس ال الايدي ينقرأ كانه قيمتين مختلفه.  ب الطريقه ذي نقدر نجمع ونحلل البيانات بدون لخبطة

Unnamed: 0,Booking ID,Customer ID
0,CNR5884300,CID1982111
1,CNR1326809,CID4604802
2,CNR8494506,CID9202816
3,CNR8906825,CID2610914
4,CNR1950162,CID9933542


In [12]:
# Convert date
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# Combine Date + Time
df['datetime'] = pd.to_datetime(df['Date'].astype(str) + ' ' + df['Time'], errors='coerce')

# Extract time features
df['hour'] = df['datetime'].dt.hour
df['dayofweek'] = df['datetime'].dt.dayofweek  # Monday=0

df[['datetime', 'hour', 'dayofweek']].head()


#وحّدنا التاريخ والوقت في عمود واحد لأننا نحتاج نحلل الطلب والإلغاءات عبر الساعات والأيام.
# استخرجنا الساعات و ايام العمل  لاستخدامها في نماذج التنبؤ بالطلب والتأخير.

Unnamed: 0,datetime,hour,dayofweek
0,2024-03-23 12:29:38,12,5
1,2024-11-29 18:01:39,18,4
2,2024-08-23 08:56:10,8,4
3,2024-10-21 17:17:25,17,0
4,2024-09-16 22:08:00,22,0


In [13]:
cancel_cols = [
    'Cancelled Rides by Customer',
    'Cancelled Rides by Driver',
    'Incomplete Rides'
]

for col in cancel_cols:
    df[col] = df[col].fillna(0).astype(int)

df[cancel_cols].head()


#حولنا اعمدة الالغاء من nan --- 0 or 1
#للان ال nan ----- معناها م صار الغاء
#ف بدال ما نحذف الصف احسن نخليه 0 لاننا نحتاج هذي المعلومات
#كذا يفهم النموذج الالغاء صح ومانضيع شي من الداتا المهمة

Unnamed: 0,Cancelled Rides by Customer,Cancelled Rides by Driver,Incomplete Rides
0,0,0,0
1,0,0,1
2,0,0,0
3,0,0,0
4,0,0,0


In [14]:
df['is_cancelled'] = df['Booking Status'].apply(
    lambda x: 1 if x in [
        'Cancelled by Customer',
        'Cancelled by Driver',
        'No Driver Found',
        'Incomplete'
    ] else 0
)

df[['Booking Status', 'is_cancelled']].head()


#انشات متغير عشان يميز الرحلات المكنسله من المكتمله --- عشان التنبؤ بالغاء الرحله

Unnamed: 0,Booking Status,is_cancelled
0,No Driver Found,1
1,Incomplete,1
2,Completed,0
3,Completed,0
4,Completed,0


In [18]:
df['cancel_type'] = df['Booking Status'].map({
    'Completed': 'none',
    'Cancelled by Customer': 'customer',
    'Cancelled by Driver': 'driver',
    'No Driver Found': 'no_driver',
    'Incomplete': 'incomplete'

})

#خليت ال كانسل تايب يصنف نوع الالغاء اذا من العميل او السائق او لايوجد سائق ---حيساعدنا ف ال clustring

**Text Cleaning**

In [16]:
text_cols = [
    'Vehicle Type',
    'Pickup Location',
    'Drop Location',
    'Payment Method',
    'Reason for cancelling by Customer',
    'Driver Cancellation Reason',
    'Incomplete Rides Reason'
]

for col in text_cols:
    df[col] = df[col].astype(str).str.strip().str.lower()

df[text_cols].head()



#نظّفنا الأعمدة النصية وشلّينا المسافات وخليّنا الكتابة كلها small letters،
# عشان ما يصير المكان مكتوب بأكثر من طريقة ويخرب علينا التحليل والـ grouping
#يعني سوّينا توحيد للنصوص: شلّنا المسافات، وخليّنا كل الكتابة بحروف صغيرة، عشان ما يصير عندنا نفس القيمة مكررة بأكثر من شكل. هذا يسهّل علينا التحليل والتجميع والكلسترنق.

Unnamed: 0,Vehicle Type,Pickup Location,Drop Location,Payment Method,Reason for cancelling by Customer,Driver Cancellation Reason,Incomplete Rides Reason
0,ebike,palam vihar,jhilmil,,,,
1,go sedan,shastri nagar,gurgaon sector 56,upi,,,vehicle breakdown
2,auto,khandsa,malviya nagar,debit card,,,
3,premier sedan,central secretariat,inderlok,upi,,,
4,bike,ghitorni village,khan market,upi,,,


**Numeric Columns Cleaning**

In [17]:
num_cols = [
    'Avg VTAT', 'Avg CTAT', 'Booking Value', 'Ride Distance',
    'Driver Ratings', 'Customer Rating'
]

df[num_cols] = df[num_cols].apply(pd.to_numeric, errors='coerce')

df[num_cols].dtypes


#حوّلنا الأعمدة الرقمية لنوع float
#عشان نتأكد إن العمليات الحسابية والرسوم البيانية تشتغل صح، وما تكون فيه قيم مخزنة كنصوص.

Unnamed: 0,0
Avg VTAT,float64
Avg CTAT,float64
Booking Value,float64
Ride Distance,float64
Driver Ratings,float64
Customer Rating,float64


**Detection Outliers**

In [19]:
df[num_cols].describe()

Unnamed: 0,Avg VTAT,Avg CTAT,Booking Value,Ride Distance,Driver Ratings,Customer Rating
count,139500.0,102000.0,102000.0,102000.0,93000.0,93000.0
mean,8.456352,29.149636,508.295912,24.637012,4.230992,4.404584
std,3.773564,8.902577,395.805774,14.002138,0.436871,0.437819
min,2.0,10.0,50.0,1.0,3.0,3.0
25%,5.3,21.6,234.0,12.46,4.1,4.2
50%,8.3,28.8,414.0,23.72,4.3,4.5
75%,11.3,36.8,689.0,36.82,4.6,4.8
max,20.0,45.0,4277.0,50.0,5.0,5.0


**Missing Indicators**

In [20]:
# بدل ما نحذف ال NAN
#نضيف عمود جديد يقول هل القيمة مفقودة ؟ 0 او 1
for col in num_cols:
    df[col + '_missing'] = df[col].isna().astype(int)

#أضفنا أعمدة missing indicators.
# هذه الأعمدة تقول للموديل إذا كانت القيمة مفقودة. الدراسات تثبت إن هذه الطريقة تساعد النماذج تتنبأ أفضل لأنها تفهم نمط القيم المفقودة.

**Negative / Impossible Value**

In [21]:
# اذا لقينا قيمة سالبه معناته في خطا ف البيانات
# Check for negative values
for col in ['Booking Value', 'Ride Distance', 'Driver Ratings', 'Customer Rating']:
    negatives = df[df[col] < 0]
    print(col, "Negative count =", len(negatives))

Booking Value Negative count = 0
Ride Distance Negative count = 0
Driver Ratings Negative count = 0
Customer Rating Negative count = 0


**Text Cleaning**

In [31]:
text_cols = ['Pickup Location', 'Drop Location', 'Vehicle Type', 'Payment Method']

for col in text_cols:
    df[col] = df[col].astype(str).str.strip().str.lower()


print(df.head)

#رتبت النصوص , شلت المسافات الزايدة والحروف الكبيره عشان مايصير عندي نفس المكان مكرر بس لانه مكتوب بطريقة مختلفة

<bound method NDFrame.head of              Date      Time  Booking ID   Booking Status Customer ID  \
0      2024-03-23  12:29:38  CNR5884300  No Driver Found  CID1982111   
1      2024-11-29  18:01:39  CNR1326809       Incomplete  CID4604802   
2      2024-08-23  08:56:10  CNR8494506        Completed  CID9202816   
3      2024-10-21  17:17:25  CNR8906825        Completed  CID2610914   
4      2024-09-16  22:08:00  CNR1950162        Completed  CID9933542   
...           ...       ...         ...              ...         ...   
149995 2024-11-11  19:34:01  CNR6500631        Completed  CID4337371   
149996 2024-11-24  15:55:09  CNR2468611        Completed  CID2325623   
149997 2024-09-18  10:55:15  CNR6358306        Completed  CID9925486   
149998 2024-10-05  07:53:34  CNR3030099        Completed  CID9415487   
149999 2024-03-10  15:38:03  CNR3447390        Completed  CID4108667   

         Vehicle Type         Pickup Location      Drop Location  Avg VTAT  \
0               ebike      

In [33]:
# Feature engineering
df['day'] = df['datetime'].dt.day
df['month'] = df['datetime'].dt.month
df['year'] = df['datetime'].dt.year
df['day_of_week'] = df['datetime'].dt.dayofweek   # Monday=0 .. Sunday=6
df['is_weekend'] = df['day_of_week'].isin([5,6]).astype(int)

In [34]:
cat_cols = ['Vehicle Type', 'cancel_type', 'Payment Method',
            'Pickup Location', 'Drop Location']

for col in cat_cols:
    df[col] = df[col].astype('category')


#حوّلت الأعمدة اللي فيها كلمات مو ارقام لصيغة category
#عشان الموديلز تفهم المعلومات

**Final Cleaning Check**

In [35]:
# Final check for NaN counts
print("Missing values after cleaning:\n", df.isnull().sum())

#print final shape
print("\nFinal shape:", df.shape)

# Check data types
print("\nData types:\n", df.dtypes)

# Statistical summary
print("\nSummary statistics:\n", df.describe(include='all'))

#print null
print("Check of null value" ,df.isna().sum())

Missing values after cleaning:
 Date                                     0
Time                                     0
Booking ID                               0
Booking Status                           0
Customer ID                              0
Vehicle Type                             0
Pickup Location                          0
Drop Location                            0
Avg VTAT                             10500
Avg CTAT                             48000
Cancelled Rides by Customer              0
Reason for cancelling by Customer        0
Cancelled Rides by Driver                0
Driver Cancellation Reason               0
Incomplete Rides                         0
Incomplete Rides Reason                  0
Booking Value                        48000
Ride Distance                        48000
Driver Ratings                       57000
Customer Rating                      57000
Payment Method                           0
datetime                                 0
hour                  

In [37]:
df_cleaning = df.to_csv("cleaned_uber_dataset.csv")

In [None]:
import os
os.path.exists("cleaned_uber_dataset.csv")

True

# **Preprocessing**