# Modeling Pipeline: DWF Optimization with RL & Demand Forecasting
This notebook implements a Deep Reinforcement Learning model to dynamically optimize fare incentives using preprocessed and enriched ride-hailing data.

In [2]:
# Imports
import pandas as pd
import numpy as np
import gym
from gym import spaces
from sklearn.preprocessing import LabelEncoder
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv

In [3]:
# Load  Dataset
df = pd.read_csv("synthetic_ride_hailing_dataset.csv")

In [4]:
# Label encode categorical features
df['Time of Day'] = LabelEncoder().fit_transform(df['Time of Day'])
df['Month of Year'] = LabelEncoder().fit_transform(df['Month of Year'])
df['Pickup Location'] = LabelEncoder().fit_transform(df['Pickup Location'])

print("Dataset loaded and encoded.")
print(df.head())

Dataset loaded and encoded.
   Pickup Location  Request to Pickup  Hour of Day  Time of Day  \
0               88              316.0            8            2   
1              169              356.0           16            0   
2               77              556.0            3            3   
3              188              423.0           17            1   
4              157              155.0           21            3   

   Month of Year  Surge Pricing Indicator  DWF Reward Applied  \
0              0                        1                0.00   
1              9                        0                0.00   
2              7                        0                3.12   
3              6                        1                4.08   
4              0                        0                0.00   

   Historical Demand Forecast       RPI  incentive  fare_adjustment       DPI  \
0                    0.656204  0.344730   1.047148         0.089986  0.013614   
1               

##  Define RL Environment

In [5]:
# Define RideHailingEnv
class RideHailingEnv(gym.Env):
    def __init__(self, df):
        super(RideHailingEnv, self).__init__()
        self.df = df.reset_index(drop=True)
        self.current_idx = 0

        self.observation_space = spaces.Box(low=0, high=1, shape=(8,), dtype=np.float32)
        self.action_space = spaces.Box(low=np.array([-0.15, 0.0]), high=np.array([0.15, 5.0]), dtype=np.float32)

    def reset(self):
        self.current_idx = 0
        return self._get_observation()

    def step(self, action):
        if self.current_idx >= len(self.df) - 1:
            return self._get_observation(), 0, True, {}

        row = self.df.iloc[self.current_idx]
        fare_adjustment, rider_incentive = action

        # Reward logic
        ride_completed = row['CR'] < 0.5
        reward = 1.0 if ride_completed else -1.0
        if rider_incentive > 3.0:
            reward -= 0.5
        if fare_adjustment < -0.10:
            reward -= 0.3

        self.current_idx += 1
        done = self.current_idx >= len(self.df)
        return self._get_observation(), reward, done, {}

    def _get_observation(self):
        row = self.df.iloc[self.current_idx]
        obs = np.array([
            row['Pickup Location'],
            row['Request to Pickup'],
            row['Time of Day'],
            row['Month of Year'],
            row['RPI'],
            row['DPI'],
            row['CR'],
            row['Historical Demand Forecast']
        ], dtype=np.float32)
        return obs


## Train the PPO Model

In [6]:
# Custom Neural Network Architecture
policy_kwargs = dict(net_arch=[64, 64])  # Two hidden layers with 64 neurons each

# Retrain Model with Updated Observation Space (12 features)
env = DummyVecEnv([lambda: RideHailingEnv(df)])
model = PPO("MlpPolicy", env, verbose=1, policy_kwargs=policy_kwargs)

# ⏱Increase Training Time
model.learn(total_timesteps=200000)  # More steps = better convergence

# Save as new version
model.save("dwf_rl_pricing_model_v6")  # Updated version aligned with your final env



Using cpu device


  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


-----------------------------
| time/              |      |
|    fps             | 1359 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 986         |
|    iterations           | 2           |
|    time_elapsed         | 4           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.007720041 |
|    clip_fraction        | 0.0935      |
|    clip_range           | 0.2         |
|    entropy_loss         | -2.83       |
|    explained_variance   | -0.00387    |
|    learning_rate        | 0.0003      |
|    loss                 | 4.78        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.00882    |
|    std                  | 0.997       |
|    value_loss           | 9.97        |
----------------------------------

## Inference & Real-Time Decision Example

In [7]:
# Load model and predict for a sample state
model = PPO.load("dwf_rl_pricing_model_v6")
sample_env = RideHailingEnv(df)
obs = sample_env.reset()
action, _states = model.predict(obs)
print(f"Suggested Fare Adjustment: {action[0]*100:.2f}%")
print(f"Suggested Rider Incentive: ${action[1]:.2f}")

Suggested Fare Adjustment: 15.00%
Suggested Rider Incentive: $0.98


  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")
