# Synthetic Feature Generation for Ride-Hailing Dataset
This notebook generates missing behavioral and incentive-based features required for DWF modeling.


In [None]:
# Import Libraries
import pandas as pd
import numpy as np


In [None]:
# Load Preprocessed Dataset
df = pd.read_csv('preprocessed_ride_hailing_dataset.csv')


In [3]:
df.head()

Unnamed: 0,Pickup Location,Request to Pickup,Hour of Day,Time of Day,Month of Year
0,91,316.0,8,morning,1
1,177,356.0,16,afternoon,10
2,80,556.0,3,night,8
3,196,423.0,17,evening,7
4,165,155.0,21,night,1


In [None]:
# Surge Pricing Indicator
df['Surge Pricing Indicator'] = df['Time of Day'].apply(lambda x: 1 if x in ['morning', 'evening'] else 0)



In [None]:
# DWF Reward Applied (based on 'Request to Pickup' in seconds)
df['DWF Reward Applied'] = np.where(
    df['Request to Pickup'] > 420,
    np.round(np.random.uniform(1, 5, len(df)), 2),
    0.0
)

In [None]:
# Historical Demand Forecast
# (based on Gupta et al., 2021)

# Normalize hour and surge independently
hour_scaled = df['Hour of Day'] / 23
surge_scaled = (df['Surge Pricing Indicator'] - df['Surge Pricing Indicator'].min()) / (df['Surge Pricing Indicator'].max() - df['Surge Pricing Indicator'].min())

# Combine both with equal weight (as often implied in literature)
df['Historical Demand Forecast'] = hour_scaled + surge_scaled + np.random.normal(0, 0.05, len(df))

# Normalize the combined forecast to [0, 1]
df['Historical Demand Forecast'] = (df['Historical Demand Forecast'] - df['Historical Demand Forecast'].min()) / (df['Historical Demand Forecast'].max() - df['Historical Demand Forecast'].min())


In [None]:

# Synthetic Feature Generation for RPI, DPI, CR

import numpy as np
from scipy.special import expit  # sigmoid

# RPI = 1 - normalized(wait_time) + noise
df['RPI'] = 1 - (df['Request to Pickup'].rank(pct=True)) + np.random.normal(0, 0.05, len(df))
df['RPI'] = df['RPI'].clip(0, 1)

# DPI = (incentive + fare_adjustment) / (pickup_distance × penalty) + noise
if 'incentive' not in df.columns:
    df['incentive'] = np.random.uniform(0, 5, len(df))
if 'fare_adjustment' not in df.columns:
    df['fare_adjustment'] = np.random.uniform(-0.15, 0.15, len(df))

# Driver Patience Index (DPI): incentive-based tolerance to wait time
df['DPI'] = (df['incentive'] + df['fare_adjustment']) / (
    df['Request to Pickup'].replace(0, 1)
) + np.random.normal(0, 0.05, len(df))
df['DPI'] = df['DPI'].clip(0, 1)


# Normalize delay to [0, 1] before using in sigmoid
pickup_scaled = (df['Request to Pickup'] - df['Request to Pickup'].min()) / \
                (df['Request to Pickup'].max() - df['Request to Pickup'].min())

# CR = sigmoid(weighted sum of normalized delay, RPI, DPI)
df['CR'] = expit(
    1.2 * pickup_scaled - 0.8 * df['RPI'] - 0.6 * df['DPI']
)


In [None]:
# Save Synthetic Dataset
df.to_csv('synthetic_ride_hailing_dataset.csv', index=False)
print('Synthetic features generated and saved.')

✅ Synthetic features generated and saved.


In [11]:
df.tail(1000)

Unnamed: 0,Pickup Location,Request to Pickup,Hour of Day,Time of Day,Month of Year,Surge Pricing Indicator,DWF Reward Applied,Historical Demand Forecast,RPI,incentive,fare_adjustment,pickup_distance,DPI,CR
49000,237,35.0,9,morning,7,1,0.00,0.709301,0.932939,2.424637,0.027849,2.918149,0.000000,0.420207
49001,51,2729.0,19,evening,4,1,2.43,0.937720,0.097060,1.695361,-0.042520,8.892408,0.068815,0.711350
49002,148,109.0,19,evening,1,1,0.00,0.914984,0.917373,3.479182,0.083907,9.897515,0.045750,0.420545
49003,198,254.0,22,night,12,0,0.00,0.513270,0.514033,4.736913,-0.107947,3.085629,0.000000,0.515417
49004,78,333.0,13,afternoon,2,0,0.00,0.341190,0.255891,0.538363,-0.059753,2.909022,0.000000,0.570941
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,210,242.0,1,night,4,0,0.00,0.082577,0.610011,4.046645,-0.134505,2.237364,0.000000,0.495562
49996,107,477.0,8,morning,5,1,4.74,0.741143,0.144078,3.792134,0.040818,2.723214,0.000000,0.600374
49997,48,213.0,12,afternoon,2,0,0.00,0.318101,0.575312,3.007543,0.088829,5.501042,0.091212,0.487218
49998,101,274.0,23,night,4,0,0.00,0.486897,0.474388,3.434338,0.025823,5.136392,0.059127,0.515584
