In [156]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import os
from geopy.distance import geodesic

In [337]:
data = pd.read_csv("YourCabs.csv")

In [339]:
print(data)

           id  user_id  vehicle_model_id  package_id  travel_type_id  \
0      132512    22177                28         NaN               2   
1      132513    21413                12         NaN               2   
2      132514    22178                12         NaN               2   
3      132515    13034                12         NaN               2   
4      132517    22180                12         NaN               2   
...       ...      ...               ...         ...             ...   
43426  185937    14364                12         NaN               2   
43427  185938    48727                12         NaN               2   
43428  185939    48729                12         1.0               3   
43429  185940    30724                87         NaN               2   
43430  185941    48730                12         NaN               2   

       from_area_id  to_area_id  from_city_id  to_city_id         from_date  \
0              83.0       448.0           NaN         Na

In [160]:
data = data.drop(columns=['id', 'user_id', 'vehicle_model_id'])

In [161]:
print(data.head())

   package_id  travel_type_id  from_area_id  to_area_id  from_city_id  \
0         NaN               2          83.0       448.0           NaN   
1         NaN               2        1010.0       540.0           NaN   
2         NaN               2        1301.0      1034.0           NaN   
3         NaN               2         768.0       398.0           NaN   
4         NaN               2        1365.0       849.0           NaN   

   to_city_id         from_date  online_booking  mobile_site_booking  \
0         NaN  01-01-2013 02:00               0                    0   
1         NaN  01-01-2013 09:00               0                    0   
2         NaN  01-01-2013 03:30               0                    0   
3         NaN  01-01-2013 05:45               0                    0   
4         NaN  01-01-2013 09:00               0                    0   

    booking_created   from_lat  from_long     to_lat    to_long  \
0  01-01-2013 01:39  12.924150  77.672290  12.927320  77.6357

In [162]:
data_type_1 = data[data['travel_type_id'] == 1]
data_type_2 = data[data['travel_type_id'] == 2]
data_type_3 = data[data['travel_type_id'] == 3]

In [163]:
print("Data for travel_type_id = 1:")
print(data_type_1.head())

print("\nData for travel_type_id = 2:")
print(data_type_2.head())

print("\nData for travel_type_id = 3:")
print(data_type_3.head())

Data for travel_type_id = 1:
     package_id  travel_type_id  from_area_id  to_area_id  from_city_id  \
6           NaN               1         571.0         NaN          15.0   
17          NaN               1        1383.0         NaN           NaN   
24          NaN               1         515.0         NaN          15.0   
158         NaN               1         136.0         NaN           1.0   
216         NaN               1        1118.0         NaN           NaN   

     to_city_id         from_date  online_booking  mobile_site_booking  \
6         108.0  01-01-2013 09:45               0                    0   
17         32.0  01-12-2013 08:00               1                    0   
24         32.0  01-05-2013 08:30               0                    0   
158       152.0  01-04-2013 09:30               1                    0   
216        32.0  01-06-2013 07:30               1                    0   

      booking_created   from_lat  from_long  to_lat  to_long  Car_Cancellat

In [164]:
data_type_1 = data[data['travel_type_id'] == 1]

In [165]:
data_type_1 = data_type_1.drop(columns=['from_lat', 'from_long', 'to_lat', 'to_long'])

In [166]:
print("Data for travel_type_id = 1 after dropping latitude and longitude columns:")
print(data_type_1.head())

Data for travel_type_id = 1 after dropping latitude and longitude columns:
     package_id  travel_type_id  from_area_id  to_area_id  from_city_id  \
6           NaN               1         571.0         NaN          15.0   
17          NaN               1        1383.0         NaN           NaN   
24          NaN               1         515.0         NaN          15.0   
158         NaN               1         136.0         NaN           1.0   
216         NaN               1        1118.0         NaN           NaN   

     to_city_id         from_date  online_booking  mobile_site_booking  \
6         108.0  01-01-2013 09:45               0                    0   
17         32.0  01-12-2013 08:00               1                    0   
24         32.0  01-05-2013 08:30               0                    0   
158       152.0  01-04-2013 09:30               1                    0   
216        32.0  01-06-2013 07:30               1                    0   

      booking_created  Car_Ca

#### Calculate the cancellation percentage for each from_area_id

In [168]:
cancellation_rate = data_type_1.groupby('from_area_id')['Car_Cancellation'].mean() * 100

In [169]:
data_type_1['Cancellation_Percentage'] = data_type_1['from_area_id'].map(cancellation_rate)

In [170]:
def categorize_cancellation(percentage):
    if percentage == 0:
        return "No Cancellation"
    elif percentage < 30:
        return "Low Cancellation"
    elif 30 <= percentage <= 70:
        return "Moderate Cancellation"
    else:
        return "High Cancellation"

data_type_1['Cancellation_Category'] = data_type_1['Cancellation_Percentage'].apply(categorize_cancellation)


In [171]:
print("Data for travel_type_id = 1 after transformation:")
print(data_type_1[['from_area_id', 'Cancellation_Percentage', 'Cancellation_Category']].head())

Data for travel_type_id = 1 after transformation:
     from_area_id  Cancellation_Percentage Cancellation_Category
6           571.0                 2.040816      Low Cancellation
17         1383.0                 0.000000       No Cancellation
24          515.0                 0.000000       No Cancellation
158         136.0                 0.000000       No Cancellation
216        1118.0                 0.000000       No Cancellation


In [172]:
data_type_1 = data_type_1.drop(columns=['from_area_id', 'to_area_id', 'package_id', 'travel_type_id'])
print("Data for travel_type_id = 1 after removing specified columns:")
print(data_type_1.head())

Data for travel_type_id = 1 after removing specified columns:
     from_city_id  to_city_id         from_date  online_booking  \
6            15.0       108.0  01-01-2013 09:45               0   
17            NaN        32.0  01-12-2013 08:00               1   
24           15.0        32.0  01-05-2013 08:30               0   
158           1.0       152.0  01-04-2013 09:30               1   
216           NaN        32.0  01-06-2013 07:30               1   

     mobile_site_booking   booking_created  Car_Cancellation  \
6                      0  01-01-2013 09:21                 0   
17                     0  01-01-2013 12:17                 0   
24                     0  01-01-2013 14:40                 0   
158                    0  01-03-2013 12:19                 0   
216                    0  01-04-2013 09:32                 0   

     Cancellation_Percentage Cancellation_Category  
6                   2.040816      Low Cancellation  
17                  0.000000       No Cancel

####  Create a 'route' column

In [174]:
data_type_1['route'] = data_type_1['from_city_id'].astype(str) + '_' + data_type_1['to_city_id'].astype(str)

In [175]:
cancellation_rate_route = data_type_1.groupby('route')['Car_Cancellation'].mean() * 100
data_type_1['Cancellation_Percentage_Route'] = data_type_1['route'].map(cancellation_rate_route)

In [176]:
data_type_1 = data_type_1.drop(columns=['from_city_id', 'to_city_id'])
print("Data for travel_type_id = 1 after processing routes and cancellations:")
print(data_type_1.head())

Data for travel_type_id = 1 after processing routes and cancellations:
            from_date  online_booking  mobile_site_booking   booking_created  \
6    01-01-2013 09:45               0                    0  01-01-2013 09:21   
17   01-12-2013 08:00               1                    0  01-01-2013 12:17   
24   01-05-2013 08:30               0                    0  01-01-2013 14:40   
158  01-04-2013 09:30               1                    0  01-03-2013 12:19   
216  01-06-2013 07:30               1                    0  01-04-2013 09:32   

     Car_Cancellation  Cancellation_Percentage Cancellation_Category  \
6                   0                 2.040816      Low Cancellation   
17                  0                 0.000000       No Cancellation   
24                  0                 0.000000       No Cancellation   
158                 0                 0.000000       No Cancellation   
216                 0                 0.000000       No Cancellation   

          route

#### 'from_date' and 'booking_created '

In [178]:
data_type_1['from_date'] = pd.to_datetime(data_type_1['from_date'], format='%m-%d-%Y %H:%M', errors='coerce')
data_type_1['booking_created'] = pd.to_datetime(data_type_1['booking_created'], format='%m-%d-%Y %H:%M', errors='coerce')

In [179]:
data_type_1['Month'] = data_type_1['from_date'].dt.month
data_type_1['Day'] = data_type_1['from_date'].dt.day
data_type_1['Hour'] = data_type_1['from_date'].dt.hour

In [180]:
def time_of_day(hour):
    if pd.isna(hour):
        return 'Unknown'
    elif 5 <= hour < 12:
        return 'Morning'
    elif 12 <= hour < 17:
        return 'Afternoon'
    elif 17 <= hour < 21:
        return 'Evening'
    else:
        return 'Night'

data_type_1['Time_of_Day'] = data_type_1['Hour'].apply(time_of_day)

In [181]:
data_type_1['Weekday'] = data_type_1['from_date'].dt.weekday

In [182]:
def weekday_or_weekend(day):
    if pd.isna(day):
        return 'Unknown'
    elif day < 5:  # Weekday (Monday=0, Sunday=6)
        return 'Weekday'
    else:
        return 'Weekend'

data_type_1['Weekday_Weekend'] = data_type_1['Weekday'].apply(weekday_or_weekend)

In [183]:
data_type_1['Hours_Before_Start'] = (data_type_1['from_date'] - data_type_1['booking_created']).dt.total_seconds() / 3600

In [184]:
data_type_1 = data_type_1.drop(columns=['Hour', 'Weekday'])

In [185]:
print("Final Data with extracted date features and booking time difference:")
print(data_type_1[['from_date', 'booking_created', 'Month', 'Day', 'Time_of_Day', 'Weekday_Weekend', 'Hours_Before_Start']].head())

Final Data with extracted date features and booking time difference:
              from_date     booking_created  Month   Day Time_of_Day  \
6   2013-01-01 09:45:00 2013-01-01 09:21:00    1.0   1.0     Morning   
17  2013-01-12 08:00:00 2013-01-01 12:17:00    1.0  12.0     Morning   
24  2013-01-05 08:30:00 2013-01-01 14:40:00    1.0   5.0     Morning   
158 2013-01-04 09:30:00 2013-01-03 12:19:00    1.0   4.0     Morning   
216 2013-01-06 07:30:00 2013-01-04 09:32:00    1.0   6.0     Morning   

    Weekday_Weekend  Hours_Before_Start  
6           Weekday            0.400000  
17          Weekend          259.716667  
24          Weekend           89.833333  
158         Weekday           21.183333  
216         Weekend           45.966667  


In [186]:
model_data = data_type_1.drop(columns=['from_date', 'booking_created'])

In [187]:
model_data = model_data.fillna(model_data.median(numeric_only=True)).fillna(model_data.mode().iloc[0])

In [188]:
label_encoders = {}
for column in model_data.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    model_data[column] = le.fit_transform(model_data[column])
    label_encoders[column] = le

In [189]:
X = model_data.drop(columns=['Car_Cancellation'])  # Features
y = model_data['Car_Cancellation']  # Target

#### Split the data into training and testing sets

In [191]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [192]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [193]:
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

In [194]:
y_pred = model.predict(X_test)

In [195]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9905660377358491
Confusion Matrix:
 [[314   1]
 [  2   1]]
Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00       315
           1       0.50      0.33      0.40         3

    accuracy                           0.99       318
   macro avg       0.75      0.67      0.70       318
weighted avg       0.99      0.99      0.99       318



# Travel_type_id 2

In [236]:
data_type_2 = data[data['travel_type_id'] == 2]

In [238]:
data_type_2 = data_type_2.drop(columns=['package_id', 'travel_type_id', 'from_city_id', 'to_city_id'])

In [240]:
label_encoders = {}
categorical_features = ['from_area_id', 'to_area_id']

for feature in categorical_features:
    le = LabelEncoder()
    data_type_2[feature] = le.fit_transform(data_type_2[feature])
    label_encoders[feature] = le

In [242]:
def calculate_distance(row):
    coords_1 = (row['from_lat'], row['from_long'])
    coords_2 = (row['to_lat'], row['to_long'])
    return geodesic(coords_1, coords_2).kilometers

data_type_2['Distance_km'] = data_type_2.apply(calculate_distance, axis=1)

In [243]:
data_type_2 = data_type_2.drop(columns=['from_lat', 'from_long', 'to_lat', 'to_long'])

#### from 'from_date' and calculate time differences

In [246]:
data_type_2['from_date'] = pd.to_datetime(data_type_2['from_date'], errors='coerce')
data_type_2['booking_created'] = pd.to_datetime(data_type_2['booking_created'], errors='coerce')

In [248]:
data_type_2['Month'] = data_type_2['from_date'].dt.month
data_type_2['Day'] = data_type_2['from_date'].dt.day
data_type_2['Time_of_Day'] = data_type_2['from_date'].dt.hour
data_type_2['Weekday_Weekend'] = data_type_2['from_date'].dt.dayofweek

In [250]:
data_type_2['Hours_Before_Start'] = (data_type_2['from_date'] - data_type_2['booking_created']).dt.total_seconds() / 3600.0

In [252]:
data_type_2 = data_type_2.drop(columns=['from_date', 'booking_created'])

#### 'from_area_id' and 'to_area_id'

In [254]:
from_area_cancellation_rate = data_type_2.groupby('from_area_id')['Car_Cancellation'].mean() * 100
to_area_cancellation_rate = data_type_2.groupby('to_area_id')['Car_Cancellation'].mean() * 100

In [256]:
def categorize_cancellation(rate):
    if rate == 0:
        return 'No_Cancellation'
    elif rate < 30:
        return 'Low_Cancellation'
    elif 33 <= rate <= 67:
        return 'Medium_Cancellation'
    else:
        return 'High_Cancellation'

data_type_2['From_Area_Cancel_Category'] = data_type_2['from_area_id'].map(from_area_cancellation_rate).apply(categorize_cancellation)
data_type_2['To_Area_Cancel_Category'] = data_type_2['to_area_id'].map(to_area_cancellation_rate).apply(categorize_cancellation)

#### Calculate route cancellation percentage

In [262]:
route_cancellation_rate = data_type_2.groupby(['from_area_id', 'to_area_id'])['Car_Cancellation'].mean() * 100

In [264]:
data_type_2['Route_Cancellation_Rate'] = data_type_2.apply(
    lambda row: route_cancellation_rate[(row['from_area_id'], row['to_area_id'])], axis=1
)

In [266]:
data_type_2['Route_Cancel_Category'] = data_type_2['Route_Cancellation_Rate'].apply(categorize_cancellation)

In [268]:
data_type_2 = data_type_2.drop(columns=['from_area_id', 'to_area_id'])

In [272]:
new_categorical_features = ['From_Area_Cancel_Category', 'To_Area_Cancel_Category', 'Route_Cancel_Category']
for feature in new_categorical_features:
    le = LabelEncoder()
    data_type_2[feature] = le.fit_transform(data_type_2[feature])
    label_encoders[feature] = le

In [274]:
numeric_features = ['Distance_km', 'Month', 'Day', 'Hours_Before_Start', 'Route_Cancellation_Rate']

In [276]:
scaler = StandardScaler()
data_type_2[numeric_features] = scaler.fit_transform(data_type_2[numeric_features])

In [278]:
X = data_type_2.drop(columns=['Car_Cancellation'])  # Features
y = data_type_2['Car_Cancellation']  # Target variable

In [280]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [282]:
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

In [284]:
y_pred = model.predict(X_test)

In [286]:
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nAccuracy Score:", accuracy_score(y_test, y_pred))

Confusion Matrix:
 [[9310  148]
 [ 356  474]]

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.98      0.97      9458
           1       0.76      0.57      0.65       830

    accuracy                           0.95     10288
   macro avg       0.86      0.78      0.81     10288
weighted avg       0.95      0.95      0.95     10288


Accuracy Score: 0.9510108864696734


# travel_type_id-3

In [291]:
data_type_3 = data[data['travel_type_id'] == 3]

In [295]:
data_type_3

Unnamed: 0,package_id,travel_type_id,from_area_id,to_area_id,from_city_id,to_city_id,from_date,online_booking,mobile_site_booking,booking_created,from_lat,from_long,to_lat,to_long,Car_Cancellation
8,2.0,3,448.0,,,,01-01-2013 16:00,0,0,01-01-2013 09:44,12.927320,77.635750,,,0
20,2.0,3,471.0,,,,01-01-2013 14:30,0,0,01-01-2013 12:52,13.018540,77.635240,,,0
23,1.0,3,1286.0,,,,01-01-2013 16:30,0,0,01-01-2013 14:39,12.973448,77.620320,,,0
34,1.0,3,1323.0,,,,01-01-2013 20:00,1,0,01-01-2013 17:25,12.869805,77.653211,,,0
47,2.0,3,142.0,,,,01-02-2013 03:00,0,0,01-01-2013 19:37,12.912810,77.609230,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43415,1.0,3,768.0,,15.0,,11/24/2013 16:00,0,1,11/24/2013 12:32,12.989990,77.553320,,,0
43417,4.0,3,1390.0,,15.0,,11/24/2013 15:00,0,0,11/24/2013 13:06,12.969368,77.641302,,,0
43420,2.0,3,1237.0,,15.0,,11/25/2013 1:00,1,0,11/24/2013 13:54,12.926450,77.612060,,,0
43424,2.0,3,515.0,,15.0,,11/24/2013 15:15,0,0,11/24/2013 14:23,12.978960,77.673450,,,0


In [297]:
data_type_3 = data_type_3.drop(columns=['to_area_id', 'from_city_id', 'to_city_id'])

In [299]:
print(data_type_3.head())

    package_id  travel_type_id  from_area_id         from_date  \
8          2.0               3         448.0  01-01-2013 16:00   
20         2.0               3         471.0  01-01-2013 14:30   
23         1.0               3        1286.0  01-01-2013 16:30   
34         1.0               3        1323.0  01-01-2013 20:00   
47         2.0               3         142.0  01-02-2013 03:00   

    online_booking  mobile_site_booking   booking_created   from_lat  \
8                0                    0  01-01-2013 09:44  12.927320   
20               0                    0  01-01-2013 12:52  13.018540   
23               0                    0  01-01-2013 14:39  12.973448   
34               1                    0  01-01-2013 17:25  12.869805   
47               0                    0  01-01-2013 19:37  12.912810   

    from_long  to_lat  to_long  Car_Cancellation  
8   77.635750     NaN      NaN                 0  
20  77.635240     NaN      NaN                 0  
23  77.620320    

In [305]:
from_area_cancellation_rate = data_type_3.groupby('from_area_id')['Car_Cancellation'].mean() * 100

In [309]:
def categorize_cancellation(rate):
    if rate == 0:
        return 'No_Cancellation'
    elif rate < 30:
        return 'Low_Cancellation'
    elif 33 <= rate <= 67:
        return 'Medium_Cancellation'
    else:
        return 'High_Cancellation'

data_type_3['From_Area_Cancel_Category'] = data_type_3['from_area_id'].map(from_area_cancellation_rate).apply(categorize_cancellation)
data_type_3 = data_type_3.drop(columns=['from_area_id'])

In [311]:
data_type_3['from_date'] = pd.to_datetime(data_type_3['from_date'], errors='coerce')
data_type_3['Month'] = data_type_3['from_date'].dt.month
data_type_3['Day'] = data_type_3['from_date'].dt.day

In [313]:
def categorize_time_of_day(hour):
    if 5 <= hour < 12:
        return 'Morning'
    elif 12 <= hour < 17:
        return 'Afternoon'
    elif 17 <= hour < 21:
        return 'Evening'
    else:
        return 'Night'

data_type_3['Time_of_Day'] = data_type_3['from_date'].dt.hour.apply(categorize_time_of_day)
data_type_3['Weekday_Weekend'] = data_type_3['from_date'].dt.dayofweek.apply(lambda x: 'Weekend' if x >= 5 else 'Weekday')

In [315]:
data_type_3['booking_created'] = pd.to_datetime(data_type_3['booking_created'], errors='coerce')
data_type_3['Hours_Before_Start'] = (data_type_3['from_date'] - data_type_3['booking_created']).dt.total_seconds() / 3600

In [317]:
data_type_3 = data_type_3.drop(columns=['from_date', 'booking_created'])

In [322]:
label_encoder = LabelEncoder()
data_type_3['From_Area_Cancel_Category'] = label_encoder.fit_transform(data_type_3['From_Area_Cancel_Category'])
data_type_3['Time_of_Day'] = label_encoder.fit_transform(data_type_3['Time_of_Day'])
data_type_3['Weekday_Weekend'] = label_encoder.fit_transform(data_type_3['Weekday_Weekend'])

## features and target

In [325]:
X = data_type_3.drop(columns=['Car_Cancellation'])
y = data_type_3['Car_Cancellation']

## Split the data into training and testing

In [327]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [329]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [331]:
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

In [333]:
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

In [335]:
print(f"Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:\n", report)

Accuracy: 94.66%
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.99      0.97      2157
           1       0.33      0.12      0.18       108

    accuracy                           0.95      2265
   macro avg       0.65      0.55      0.57      2265
weighted avg       0.93      0.95      0.93      2265



# END