# Synthetic Feature Generation for Ride-Hailing Dataset
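This notebook adds synthetic behavioral and incentive-based features that are missing from the preprocessed ride-hailing dataset but required for DWF modeling: a surge pricing indicator, a historical demand forecast, RPI, DPI, and CR. The augmented data is saved to `synthetic_ride_hailing_dataset.csv`.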


In [2]:
# Import Libraries
import numpy as np
import pandas as pd
from scipy.special import expit  # sigmoid


In [3]:
# Load Preprocessed Dataset
# Read the preprocessed CSV into the working DataFrame `df` used by every later cell.
# NOTE(review): the preview of `df` shows an 'Unnamed: 0' column, which usually means
# the CSV was written together with its index; consider `index_col=0` if that column
# is not needed downstream — TODO confirm before changing.
input_csv = 'preprocessed_ride_hailing_dataset.csv'
df = pd.read_csv(input_csv)


In [4]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0,Pickup Location,Request to Pickup,Hour of Day,Time of Day,Month of Year
0,91,316.0,8,morning,1
1,177,356.0,16,afternoon,10
2,80,556.0,3,night,8
3,196,423.0,17,evening,7
4,165,155.0,21,night,1


In [5]:
# Surge Pricing Indicator
# 1 for rides whose 'Time of Day' bucket is 'morning' or 'evening', 0 for every other
# value (including missing ones). Vectorized membership test instead of a per-row lambda.
surge_periods = ['morning', 'evening']
df['Surge Pricing Indicator'] = df['Time of Day'].isin(surge_periods).astype(int)



In [None]:
# NOTE(review): this cell is disabled, yet the saved output of the final preview cell
# still lists a 'DWF Reward Applied' column, so it was presumably executed earlier in
# the same kernel session and then commented out. On a fresh kernel (Restart & Run All)
# that column will NOT be created. Re-enable this cell (ideally with a seeded RNG) if
# the column is required, or re-run and re-save the notebook — TODO confirm intent.
#
# # DWF Reward Applied (based on 'Request to Pickup' in seconds)
# # Rows with 'Request to Pickup' > 420 get a uniform random reward in [1, 5) rounded
# # to 2 decimals; all other rows get 0.0.
# df['DWF Reward Applied'] = np.where(
#     df['Request to Pickup'] > 420,
#     np.round(np.random.uniform(1, 5, len(df)), 2),
#     0.0
# )

In [7]:
# Historical Demand Forecast
# (based on Gupta et al., 2021)
#
# Synthetic proxy: scaled hour-of-day plus the scaled surge indicator (equal weights),
# perturbed with Gaussian noise (sigma = 0.05) and finally rescaled to [0, 1].


def _min_max_scale(values):
    """Linearly rescale a Series to [0, 1].

    If the Series has no spread (max == min) the plain min-max formula divides
    by zero and turns the whole column into NaN; in that case the shifted values
    (all zeros, with any existing NaN preserved) are returned instead.
    """
    lowest = values.min()
    span = values.max() - lowest
    shifted = (values - lowest).astype(float)
    # `span > 0` is False for a zero span and for a NaN span (empty / all-NaN input).
    return shifted / span if span > 0 else shifted


# Dedicated seeded generator so the noise (and therefore the saved dataset) is
# reproducible on Restart & Run All, independent of NumPy's global RNG state.
forecast_rng = np.random.default_rng(42)

# Normalize hour and surge independently (hours run 0..23, so dividing by 23 maps to [0, 1])
hour_scaled = df['Hour of Day'] / 23
surge_scaled = _min_max_scale(df['Surge Pricing Indicator'])

# Combine both with equal weight (as often implied in literature) and add noise
raw_forecast = hour_scaled + surge_scaled + forecast_rng.normal(0, 0.05, len(df))

# Normalize the combined forecast to [0, 1]
df['Historical Demand Forecast'] = _min_max_scale(raw_forecast)


In [8]:

# Synthetic Feature Generation for RPI, DPI, CR
# (numpy as `np` and scipy.special.expit are provided by the imports cell at the top)

# Dedicated seeded generator so every random draw in this cell is reproducible on
# Restart & Run All, independent of NumPy's global RNG state.
feature_rng = np.random.default_rng(43)
n_rows = len(df)
wait_time = df['Request to Pickup']

# RPI = 1 - percentile_rank(wait_time) + noise, clipped to [0, 1]
# (longer waits rank higher, so they receive a lower RPI)
df['RPI'] = (
    1 - wait_time.rank(pct=True) + feature_rng.normal(0, 0.05, n_rows)
).clip(0, 1)

# Synthesize the DPI inputs only when the dataset does not already provide them.
if 'incentive' not in df.columns:
    df['incentive'] = feature_rng.uniform(0, 5, n_rows)
if 'fare_adjustment' not in df.columns:
    df['fare_adjustment'] = feature_rng.uniform(-0.15, 0.15, n_rows)

# Driver Patience Index (DPI): incentive-based tolerance to wait time
# DPI = (incentive + fare_adjustment) / wait_time + noise, clipped to [0, 1]
# A wait of exactly 0 is replaced by 1 to avoid division by zero.
# NOTE(review): with waits in the hundreds and incentive <= 5 the ratio is ~0.01, so
# the sigma = 0.05 noise dominates and the lower clip sets many rows to exactly 0
# (visible in the saved preview). Confirm this scale is intended.
df['DPI'] = (
    (df['incentive'] + df['fare_adjustment']) / wait_time.replace(0, 1)
    + feature_rng.normal(0, 0.05, n_rows)
).clip(0, 1)

# Normalize delay to [0, 1] before using in sigmoid.
# If every wait is identical the range is zero; keep the shifted values (all zeros)
# instead of dividing by zero and propagating NaN into CR.
wait_min = wait_time.min()
wait_span = wait_time.max() - wait_min
pickup_scaled = (wait_time - wait_min).astype(float)
if wait_span > 0:
    pickup_scaled = pickup_scaled / wait_span

# CR = sigmoid(weighted sum of normalized delay, RPI, DPI)
# Delay pushes CR up; RPI and DPI pull it down.
df['CR'] = expit(1.2 * pickup_scaled - 0.8 * df['RPI'] - 0.6 * df['DPI'])


In [9]:
# Save Synthetic Dataset
# Write the augmented frame to CSV without the pandas row index, then confirm.
output_csv = 'synthetic_ride_hailing_dataset.csv'
df.to_csv(output_csv, index=False)
print('Synthetic features generated and saved.')

Synthetic features generated and saved.


In [10]:
# Inspect the last 1000 rows; the rendered table elides the middle rows.
tail_rows = 1000
df.tail(tail_rows)

Unnamed: 0,Pickup Location,Request to Pickup,Hour of Day,Time of Day,Month of Year,Surge Pricing Indicator,DWF Reward Applied,Historical Demand Forecast,RPI,incentive,fare_adjustment,DPI,CR
49000,237,35.0,9,morning,7,1,0.00,0.690703,0.989995,3.573314,0.031984,0.041266,0.403157
49001,51,2729.0,19,evening,4,1,2.17,0.898553,0.035990,3.236050,-0.043824,0.000000,0.729501
49002,148,109.0,19,evening,1,1,0.00,0.899317,0.915366,0.240214,0.068872,0.000000,0.427641
49003,198,254.0,22,night,12,0,0.00,0.527954,0.507691,2.777629,-0.015480,0.012619,0.514793
49004,78,333.0,13,afternoon,2,0,0.00,0.341017,0.378936,3.832374,0.065889,0.000000,0.546678
...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,210,242.0,1,night,4,0,0.00,0.109886,0.467346,0.640563,0.104698,0.000000,0.524077
49996,107,477.0,8,morning,5,1,3.15,0.691955,0.009332,4.647895,0.017870,0.004620,0.625287
49997,48,213.0,12,afternoon,2,0,0.00,0.306183,0.641381,0.288990,0.081145,0.049754,0.480230
49998,101,274.0,23,night,4,0,0.00,0.546826,0.438355,3.795544,0.149820,0.052807,0.523726
