# Synthetic Feature Generation for Ride-Hailing Dataset
This notebook generates missing behavioral and incentive-based features required for DWF modeling.


In [None]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder 


In [4]:
# Read the preprocessed ride-hailing records into the working DataFrame `df`.
# The path is relative to the directory the notebook is executed from.
df = pd.read_csv(
    filepath_or_buffer='datasets/preprocessed_ride_hailing_dataset.csv',
)


In [5]:
# Preview the first five rows of the freshly loaded data.
df.head(n=5)

Unnamed: 0,Pickup Location,Request to Pickup,Hour of Day,Time of Day,Month of Year
0,91,316.0,8,morning,1
1,177,356.0,16,afternoon,10
2,80,556.0,3,night,8
3,196,423.0,17,evening,7
4,165,155.0,21,night,1


In [6]:
# Surge Pricing Indicator: 1 when 'Time of Day' is 'morning' or 'evening',
# 0 for every other value.
surge_periods = ['morning', 'evening']
df['Surge Pricing Indicator'] = df['Time of Day'].isin(surge_periods).astype('int64')



In [8]:
# Historical Demand Forecast
# (based on Gupta et al., 2021)

def _min_max(values):
    """Linearly rescale `values` onto [0, 1]; a constant input maps to all zeros."""
    low = values.min()
    span = values.max() - low
    # A zero span (every value identical) would turn the division into 0/0 -> NaN
    # for the whole column, so divide by 1 instead and let the result collapse to 0.
    return (values - low) / (span if span != 0 else 1)


# Dedicated, seeded generator so the noise term below is identical on every run.
# It is separate from NumPy's global random state, so values drawn elsewhere in
# the notebook through np.random.* are not affected by this cell.
forecast_rng = np.random.default_rng(42)

# Normalize hour and surge independently
hour_scaled = df['Hour of Day'] / 23  # assumes hours are coded 0..23 -- TODO confirm
surge_scaled = _min_max(df['Surge Pricing Indicator'])

# Combine both with equal weight (as often implied in literature),
# plus Gaussian noise with mean 0 and standard deviation 0.05
forecast_noise = forecast_rng.normal(0, 0.05, len(df))
df['Historical Demand Forecast'] = hour_scaled + surge_scaled + forecast_noise

# Normalize the combined forecast to [0, 1]
df['Historical Demand Forecast'] = _min_max(df['Historical Demand Forecast'])


In [None]:
# Random Base Fare: uniform draw between 5 and 20, rounded to two decimals.
# Seeding the global NumPy RNG here fixes this draw and every later np.random.* call.
np.random.seed(42)
base_fares = np.random.uniform(5, 20, size=len(df))
df['Base Fare'] = np.round(base_fares, 2)

# Label encode categorical features: each column is overwritten with the integer
# codes produced by its own freshly fitted LabelEncoder.
for column in ('Time of Day', 'Month of Year', 'Pickup Location'):
    df[column] = LabelEncoder().fit_transform(df[column])

In [9]:
from scipy.special import expit  # sigmoid

# Add incentive and fare_adjustment columns if not present.
# Both default to a constant 0.0, i.e. the "no intervention" baseline.
if 'incentive' not in df.columns:
    df['incentive'] = 0.0  # simulate no intervention
if 'fare_adjustment' not in df.columns:
    df['fare_adjustment'] = 0.0  # simulate no intervention

# Rider Patience Index (RPI): inverse of pickup delay rank + noise.
# rank(pct=True) gives each delay's percentile rank, so longer waits -> lower RPI.
pickup_rank = df['Request to Pickup'].rank(pct=True)
epsilon_rpi = np.random.normal(loc=0.0, scale=0.02, size=len(df))
df['RPI'] = np.clip(1.0 - pickup_rank + epsilon_rpi, 0.0, 1.0)

# Driver Patience Index (DPI): based on action signal + noise.
# Scale the fare adjustment by the per-ride 'Base Fare' column when it exists;
# otherwise fall back to the flat base fare of 10 assumed previously.
base_fare = df['Base Fare'] if 'Base Fare' in df.columns else 10.0
action_signal = df['incentive'] + (df['fare_adjustment'] * base_fare).abs()
epsilon_dpi = np.random.normal(loc=0.0, scale=0.02, size=len(df))
# 1 - exp(-0.6 * signal) is 0 when there is no action and approaches 1 as it grows.
df['DPI'] = np.clip(1.0 - np.exp(-0.6 * action_signal) + epsilon_dpi, 0.0, 1.0)

# Normalize pickup delay for model use (min-max onto [0, 1]).
pickup_min = df['Request to Pickup'].min()
pickup_span = df['Request to Pickup'].max() - pickup_min
# Guard the all-equal case, where the span is 0 and the division would yield NaN.
pickup_scaled = (df['Request to Pickup'] - pickup_min) / (pickup_span if pickup_span != 0 else 1)

# Final Cancellation Rate (CR): sigmoid of a linear score that rises with the
# scaled pickup delay and falls with rider and driver patience.
df['CR'] = expit(
    1.2 * pickup_scaled - 1.3 * df['RPI'] - 2.0 * df['DPI']
)


In [10]:
# Write the augmented frame to CSV (row index omitted), then report completion.
output_path = 'synthetic_ride_hailing_dataset.csv'
df.to_csv(output_path, index=False)
print('Synthetic features generated and saved.')

Synthetic features generated and saved.


In [12]:
# Inspect the first 1000 rows, now including the generated feature columns.
df.head(n=1000)

Unnamed: 0,Pickup Location,Request to Pickup,Hour of Day,Time of Day,Month of Year,Surge Pricing Indicator,Historical Demand Forecast,incentive,fare_adjustment,RPI,DPI,CR
0,91,316.0,8,morning,1,1,0.679722,0.0,0.0,0.360737,0.000000,0.504421
1,177,356.0,16,afternoon,10,0,0.415516,0.0,0.0,0.251755,0.020995,0.531515
2,80,556.0,3,night,8,0,0.115999,0.0,0.0,0.061218,0.034148,0.596740
3,196,423.0,17,evening,7,1,0.853097,0.0,0.0,0.203734,0.028678,0.546892
4,165,155.0,21,night,1,0,0.472339,0.0,0.0,0.790407,0.000000,0.359731
...,...,...,...,...,...,...,...,...,...,...,...,...
995,255,201.0,3,night,9,0,0.116706,0.0,0.0,0.673206,0.000000,0.397956
996,229,222.0,20,night,6,0,0.459187,0.0,0.0,0.622755,0.002533,0.413669
997,34,603.0,18,evening,10,1,0.883435,0.0,0.0,0.073676,0.009704,0.607066
998,142,225.0,0,night,3,0,0.063214,0.0,0.0,0.611138,0.000000,0.418731


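## Full Script

The cell below consolidates the steps above into a single script; it additionally shuffles the rows before computing RPI, DPI and CR.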

In [None]:
# Import Libraries
import pandas as pd
import numpy as np
from scipy.special import expit
from sklearn.preprocessing import LabelEncoder 

# Load Preprocessed Dataset
df = pd.read_csv('datasets/preprocessed_ride_hailing_dataset.csv')


def _min_max(values):
    """Linearly rescale `values` onto [0, 1]; a constant input maps to all zeros."""
    low = values.min()
    span = values.max() - low
    # A zero span (every value identical) would turn the division into 0/0 -> NaN
    # for the whole column, so divide by 1 instead and let the result collapse to 0.
    return (values - low) / (span if span != 0 else 1)


# Surge Pricing Indicator: 1 for 'morning' / 'evening' rows, 0 otherwise
df['Surge Pricing Indicator'] = df['Time of Day'].isin(['morning', 'evening']).astype('int64')

# Normalize hour and surge independently
hour_scaled = df['Hour of Day'] / 23  # assumes hours are coded 0..23 -- TODO confirm
surge_scaled = _min_max(df['Surge Pricing Indicator'])

# Combine both with equal weight (as often implied in literature), plus Gaussian
# noise. The noise comes from a dedicated seeded generator so this column is
# reproducible; it does not touch NumPy's global random state, so the seeded
# draws further down produce the same values as before.
forecast_rng = np.random.default_rng(42)
forecast_noise = forecast_rng.normal(0, 0.05, len(df))
df['Historical Demand Forecast'] = hour_scaled + surge_scaled + forecast_noise

# Normalize the combined forecast to [0, 1]
df['Historical Demand Forecast'] = _min_max(df['Historical Demand Forecast'])

# Random Base Fare (e.g., $5 to $20)
# Seeds the global RNG: this draw and the RPI / DPI noise below are deterministic.
np.random.seed(42)
df['Base Fare'] = np.round(np.random.uniform(5, 20, size=len(df)), 2)

# Label encode categorical features (each column gets its own fresh encoder)
for column in ('Time of Day', 'Month of Year', 'Pickup Location'):
    df[column] = LabelEncoder().fit_transform(df[column])

# Shuffle the dataset (fixed random_state keeps the row order reproducible)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Default to the "no intervention" baseline when the action columns are absent
if 'incentive' not in df.columns:
    df['incentive'] = 0.0  # simulate no intervention
if 'fare_adjustment' not in df.columns:
    df['fare_adjustment'] = 0.0  # simulate no intervention

# Rider Patience Index (RPI): inverse of the pickup-delay percentile rank + noise
pickup_rank = df['Request to Pickup'].rank(pct=True)
epsilon_rpi = np.random.normal(loc=0.0, scale=0.02, size=len(df))
df['RPI'] = np.clip(1.0 - pickup_rank + epsilon_rpi, 0.0, 1.0)

# Driver Patience Index (DPI): saturating response to the action signal + noise
action_signal = df['incentive'] + (df['fare_adjustment'] * df['Base Fare']).abs()
epsilon_dpi = np.random.normal(loc=0.0, scale=0.02, size=len(df))
df['DPI'] = np.clip(1.0 - np.exp(-0.6 * action_signal) + epsilon_dpi, 0.0, 1.0)

# Normalize pickup delay for model use
pickup_scaled = _min_max(df['Request to Pickup'])

# Final Cancellation Rate (CR): sigmoid of a linear score that rises with the
# scaled pickup delay and falls with rider and driver patience
df['CR'] = expit(
    1.2 * pickup_scaled - 1.3 * df['RPI'] - 2.0 * df['DPI']
)

# Save Synthetic Dataset
df.to_csv('synthetic_ride_hailing_dataset.csv', index=False)