# Modeling Pipeline: DWF Optimization with RL & Demand Forecasting
This notebook implements a Deep Reinforcement Learning model to dynamically optimize fare incentives using preprocessed and enriched ride-hailing data.

In [1]:
# Imports
import pandas as pd
import numpy as np
import gym
from gym import spaces
from sklearn.preprocessing import LabelEncoder
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv

In [2]:
# Load  Dataset
df = pd.read_csv("train_data.csv")

In [3]:
# Label encode categorical features
df['Time of Day'] = LabelEncoder().fit_transform(df['Time of Day'])
df['Month of Year'] = LabelEncoder().fit_transform(df['Month of Year'])
df['Pickup Location'] = LabelEncoder().fit_transform(df['Pickup Location'])

print("Dataset loaded and encoded.")
print(df.head())

Dataset loaded and encoded.
   Pickup Location  Request to Pickup  Hour of Day  Time of Day  \
0               29              327.0           16            0   
1              187              253.0            8            2   
2               33              336.0           13            0   
3              178              102.0           13            0   
4              129              473.0            4            3   

   Month of Year  Surge Pricing Indicator  DWF Reward Applied  \
0             11                        0                0.00   
1              7                        1                0.00   
2              5                        0                0.00   
3              6                        0                0.00   
4              3                        0                3.76   

   Historical Demand Forecast       RPI  incentive  fare_adjustment       DPI  \
0                    0.373889  0.257379   3.530438        -0.112454  0.017936   
1               

##  Define RL Environment

In [4]:
# class RideHailingEnv(gym.Env):
#     def __init__(self, df):
#         super(RideHailingEnv, self).__init__()
#         self.df = df.reset_index(drop=True)
#         self.current_idx = 0

#         self.observation_space = spaces.Box(low=0, high=1, shape=(8,), dtype=np.float32)
#         self.action_space = spaces.Box(low=np.array([-0.15, 0.0]), high=np.array([0.15, 5.0]), dtype=np.float32)

#     def reset(self):
#         self.current_idx = 0
#         return self._get_observation()

#     def step(self, action):
#         if self.current_idx >= len(self.df) - 1:
#             return self._get_observation(), 0.0, True, {}

#         row = self.df.iloc[self.current_idx]
#         fare_adjustment, rider_incentive = action

#         # Reward logic
#         ride_completed = row['CR'] < 0.5
#         reward = 1.0 if ride_completed else -1.0
#         if rider_incentive > 3.0:
#             reward -= 0.5
#         if fare_adjustment < -0.10:
#             reward -= 0.3

#         self.current_idx += 1
#         done = self.current_idx >= len(self.df)
#         return self._get_observation(), reward, done, {}

#     def _get_observation(self):
#         if self.current_idx >= len(self.df):  
#             return np.zeros(self.observation_space.shape, dtype=np.float32)

#         row = self.df.iloc[self.current_idx]
#         return np.array([
#             row['Pickup Location'],
#             row['Request to Pickup'],
#             row['Time of Day'],
#             row['Month of Year'],
#             row['RPI'],
#             row['DPI'],
#             row['CR'],
#             row['Historical Demand Forecast']
#         ], dtype=np.float32)


In [5]:
import numpy as np
import gym
from gym import spaces

class RideHailingEnv(gym.Env):
    def __init__(self, df):
        super(RideHailingEnv, self).__init__()
        self.df = df.reset_index(drop=True)
        self.current_idx = 0

        self.observation_space = spaces.Box(low=0, high=1, shape=(8,), dtype=np.float32)
        self.action_space = spaces.Box(low=np.array([-0.15, 0.0]), high=np.array([0.15, 5.0]), dtype=np.float32)

    def reset(self):
        self.current_idx = 0
        return self._get_observation()

    def step(self, action):
        if self.current_idx >= len(self.df) - 1:
            return self._get_observation(), 0.0, True, {}

        row = self.df.iloc[self.current_idx]
        fare_adjustment, rider_incentive = action

        # --- Reactive Cancellation Mechanism ---
        prob_cancel = row['CR']  # originally derived cancellation risk

        # Reduce probability based on the action taken
        if rider_incentive > 2.5:
            prob_cancel *= 0.7  # higher incentive makes cancellation less likely
        if fare_adjustment > 0.1:
            prob_cancel *= 0.8  # higher fare adjustment (increase) also helps

        # Simulate cancellation event based on modified probability
        cancelled = np.random.rand() < prob_cancel

        # --- Reward Logic ---
        reward = 1.0 if not cancelled else -1.0

        # Apply cost penalties (only if you're including them in this version)
        if rider_incentive > 3.0:
            reward -= 0.5
        if fare_adjustment < -0.10:
            reward -= 0.3

        self.current_idx += 1
        done = self.current_idx >= len(self.df)
        return self._get_observation(), reward, done, {}

    def _get_observation(self):
        if self.current_idx >= len(self.df):
            return np.zeros(self.observation_space.shape, dtype=np.float32)

        row = self.df.iloc[self.current_idx]
        return np.array([
            row['Pickup Location'],
            row['Request to Pickup'],
            row['Time of Day'],
            row['Month of Year'],
            row['RPI'],
            row['DPI'],
            row['CR'],
            row['Historical Demand Forecast']
        ], dtype=np.float32)


## Train the PPO Model

In [6]:
# Custom Neural Network Architecture
policy_kwargs = dict(net_arch=[64, 64])  # Two hidden layers with 64 neurons each

In [7]:
# Retrain Model with Updated Observation Space (8 features)
env = DummyVecEnv([lambda: RideHailingEnv(df)])
model = PPO("MlpPolicy", env, verbose=1, policy_kwargs=policy_kwargs)

# ⏱Increase Training Time
model.learn(total_timesteps=200000)  # More steps = better convergence

# Save as new version
model.save("dwf_rl_pricing_model_v6")  # Updated version aligned with your final env



Using cpu device


  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


-----------------------------
| time/              |      |
|    fps             | 1385 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 994         |
|    iterations           | 2           |
|    time_elapsed         | 4           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.006237732 |
|    clip_fraction        | 0.0573      |
|    clip_range           | 0.2         |
|    entropy_loss         | -2.83       |
|    explained_variance   | -0.0189     |
|    learning_rate        | 0.0003      |
|    loss                 | 3.46        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.00557    |
|    std                  | 0.993       |
|    value_loss           | 9.63        |
----------------------------------

## Inference & Real-Time Decision Example

In [1]:
# Load model and predict for a sample state
model = PPO.load("dwf_rl_pricing_model_v6")
sample_env = RideHailingEnv(df)
obs = sample_env.reset()
action, _states = model.predict(obs)
print(f"Suggested Fare Adjustment: {action[0]*100:.2f}%")
print(f"Suggested Rider Incentive: ${action[1]:.2f}")

NameError: name 'PPO' is not defined

Baseline RL

In [9]:
import gym
import numpy as np
from gym import spaces

class RideHailingEnv_Baseline(gym.Env):
    def __init__(self, df):
        super(RideHailingEnv_Baseline, self).__init__()
        self.df = df.reset_index(drop=True)
        self.current_idx = 0

        # Observation: 4 basic context features
        self.observation_space = spaces.Box(low=0, high=1, shape=(4,), dtype=np.float32)
        self.action_space = spaces.Box(low=np.array([-0.15, 0.0]), high=np.array([0.15, 5.0]), dtype=np.float32)

    def reset(self):
        self.current_idx = 0
        return self._get_observation()

    def step(self, action):
        if self.current_idx >= len(self.df) - 1:
            return self._get_observation(), 0.0, True, {}

        row = self.df.iloc[self.current_idx]
        fare_adjustment, rider_incentive = action

        # --- Basic dynamic cancellation logic ---
        prob_cancel = row['CR']
        if rider_incentive > 2.5:
            prob_cancel *= 0.7
        if fare_adjustment > 0.1:
            prob_cancel *= 0.8

        cancelled = np.random.rand() < prob_cancel
        reward = 1.0 if not cancelled else -1.0

        self.current_idx += 1
        done = self.current_idx >= len(self.df)
        return self._get_observation(), reward, done, {}

    def _get_observation(self):
        if self.current_idx >= len(self.df): 
            return np.zeros(self.observation_space.shape, dtype=np.float32)

        row = self.df.iloc[self.current_idx]
        return np.array([
            row['Pickup Location'],
            row['Request to Pickup'],
            row['Time of Day'],
            row['Month of Year']
        ], dtype=np.float32)
    
env = DummyVecEnv([lambda: RideHailingEnv_Baseline(df)])
model = PPO("MlpPolicy", env, verbose=1, policy_kwargs=policy_kwargs)
model.learn(total_timesteps=200000)
model.save("dwf_rl_baseline_model")




Using cpu device
-----------------------------
| time/              |      |
|    fps             | 1448 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1020        |
|    iterations           | 2           |
|    time_elapsed         | 4           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.004704655 |
|    clip_fraction        | 0.0337      |
|    clip_range           | 0.2         |
|    entropy_loss         | -2.85       |
|    explained_variance   | 0.00164     |
|    learning_rate        | 0.0003      |
|    loss                 | 4.82        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.00318    |
|    std                  | 1.01        |
|    value_loss           | 9.8         |
-----------------

RPI

In [10]:
# Define RideHailingEnv_RPI with safe observation handling
class RideHailingEnv_RPI(gym.Env):
    def __init__(self, df):
        super(RideHailingEnv_RPI, self).__init__()
        self.df = df.reset_index(drop=True)
        self.current_idx = 0
        self.observation_space = spaces.Box(low=0, high=1, shape=(5,), dtype=np.float32)
        self.action_space = spaces.Box(low=np.array([-0.15, 0.0]), high=np.array([0.15, 5.0]), dtype=np.float32)

    def reset(self):
        self.current_idx = 0
        return self._get_observation()

    def step(self, action):
        if self.current_idx >= len(self.df) - 1:
            return self._get_observation(), 0.0, True, {}

        row = self.df.iloc[self.current_idx]
        fare_adjustment, rider_incentive = action
        reward = 1.0 if row['CR'] < 0.5 else -1.0
        # if rider_incentive > 3.0:
        #     reward -= 0.5
        # if fare_adjustment < -0.10:
        #     reward -= 0.3

        self.current_idx += 1
        done = self.current_idx >= len(self.df)
        return self._get_observation(), reward, done, {}

    def _get_observation(self):
        if self.current_idx >= len(self.df): 
            return np.zeros(self.observation_space.shape, dtype=np.float32)

        row = self.df.iloc[self.current_idx]
        return np.array([
            row['Pickup Location'],
            row['Request to Pickup'],
            row['Time of Day'],
            row['Month of Year'],
            row['RPI']
        ], dtype=np.float32)
    
env = DummyVecEnv([lambda: RideHailingEnv_RPI(df)])
model = PPO("MlpPolicy", env, verbose=1, policy_kwargs=policy_kwargs)
model.learn(total_timesteps=200000)
model.save("dwf_rl_RPI_model")


Using cpu device
-----------------------------
| time/              |      |
|    fps             | 1463 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 1027         |
|    iterations           | 2            |
|    time_elapsed         | 3            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0052859094 |
|    clip_fraction        | 0.0492       |
|    clip_range           | 0.2          |
|    entropy_loss         | -2.82        |
|    explained_variance   | 0.0202       |
|    learning_rate        | 0.0003       |
|    loss                 | 3.89         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.00585     |
|    std                  | 0.988        |
|    value_loss           | 9.87         |

RPI + DPI

In [11]:
class RideHailingEnv_RPI_DPI(gym.Env):
    def __init__(self, df):
        super(RideHailingEnv_RPI_DPI, self).__init__()
        self.df = df.reset_index(drop=True)
        self.current_idx = 0
        self.observation_space = spaces.Box(low=0, high=1, shape=(6,), dtype=np.float32)
        self.action_space = spaces.Box(low=np.array([-0.15, 0.0]), high=np.array([0.15, 5.0]), dtype=np.float32)

    def reset(self):
        self.current_idx = 0
        return self._get_observation()

    def step(self, action):
        if self.current_idx >= len(self.df) - 1:
            return self._get_observation(), 0.0, True, {}

        row = self.df.iloc[self.current_idx]
        fare_adjustment, rider_incentive = action
        reward = 1.0 if row['CR'] < 0.5 else -1.0
        # if rider_incentive > 3.0: reward -= 0.5
        # if fare_adjustment < -0.10: reward -= 0.3

        self.current_idx += 1
        done = self.current_idx >= len(self.df)
        return self._get_observation(), reward, done, {}

    def _get_observation(self):
        if self.current_idx >= len(self.df):
            return np.zeros(self.observation_space.shape, dtype=np.float32)

        row = self.df.iloc[self.current_idx]
        return np.array([
            row['Pickup Location'],
            row['Request to Pickup'],
            row['Time of Day'],
            row['Month of Year'],
            row['RPI'],
            row['DPI']
        ], dtype=np.float32)

    
env = DummyVecEnv([lambda: RideHailingEnv_RPI_DPI(df)])
model = PPO("MlpPolicy", env, verbose=1, policy_kwargs=policy_kwargs)
model.learn(total_timesteps=200000)
model.save("dwf_rl_RPI_DPI_model")


Using cpu device
-----------------------------
| time/              |      |
|    fps             | 1231 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 885          |
|    iterations           | 2            |
|    time_elapsed         | 4            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0033737223 |
|    clip_fraction        | 0.0354       |
|    clip_range           | 0.2          |
|    entropy_loss         | -2.84        |
|    explained_variance   | -0.00335     |
|    learning_rate        | 0.0003       |
|    loss                 | 4.36         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.00305     |
|    std                  | 1            |
|    value_loss           | 9.94         |

RPI + DPI + CR

In [12]:
class RideHailingEnv_RPI_DPI_CR(gym.Env):
    def __init__(self, df):
        super(RideHailingEnv_RPI_DPI_CR, self).__init__()
        self.df = df.reset_index(drop=True)
        self.current_idx = 0
        self.observation_space = spaces.Box(low=0, high=1, shape=(7,), dtype=np.float32)
        self.action_space = spaces.Box(low=np.array([-0.15, 0.0]), high=np.array([0.15, 5.0]), dtype=np.float32)

    def reset(self):
        self.current_idx = 0
        return self._get_observation()

    def step(self, action):
        if self.current_idx >= len(self.df) - 1:
            return self._get_observation(), 0.0, True, {}

        row = self.df.iloc[self.current_idx]
        fare_adjustment, rider_incentive = action
        reward = 1.0 if row['CR'] < 0.5 else -1.0
        # if rider_incentive > 3.0: reward -= 0.5
        # if fare_adjustment < -0.10: reward -= 0.3

        self.current_idx += 1
        done = self.current_idx >= len(self.df)
        return self._get_observation(), reward, done, {}

    def _get_observation(self):
        if self.current_idx >= len(self.df):
            return np.zeros(self.observation_space.shape, dtype=np.float32)

        row = self.df.iloc[self.current_idx]
        return np.array([
            row['Pickup Location'],
            row['Request to Pickup'],
            row['Time of Day'],
            row['Month of Year'],
            row['RPI'],
            row['DPI'],
            row['CR']
        ], dtype=np.float32)


    
env = DummyVecEnv([lambda: RideHailingEnv_RPI_DPI_CR(df)])
model = PPO("MlpPolicy", env, verbose=1, policy_kwargs=policy_kwargs)
model.learn(total_timesteps=200000)
model.save("dwf_rl_RPI_DPI_CR_model")


Using cpu device
-----------------------------
| time/              |      |
|    fps             | 1362 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 992          |
|    iterations           | 2            |
|    time_elapsed         | 4            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0065879524 |
|    clip_fraction        | 0.0492       |
|    clip_range           | 0.2          |
|    entropy_loss         | -2.84        |
|    explained_variance   | -0.0602      |
|    learning_rate        | 0.0003       |
|    loss                 | 5.54         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.00502     |
|    std                  | 0.997        |
|    value_loss           | 9.63         |

HDF

In [13]:
class RideHailingEnv_HDF(gym.Env):
    def __init__(self, df):
        super(RideHailingEnv_HDF, self).__init__()
        self.df = df.reset_index(drop=True)
        self.current_idx = 0
        self.observation_space = spaces.Box(low=0, high=1, shape=(5,), dtype=np.float32)
        self.action_space = spaces.Box(low=np.array([-0.15, 0.0]), high=np.array([0.15, 5.0]), dtype=np.float32)

    def reset(self):
        self.current_idx = 0
        return self._get_observation()

    def step(self, action):
        if self.current_idx >= len(self.df) - 1:
            return self._get_observation(), 0.0, True, {}

        row = self.df.iloc[self.current_idx]
        fare_adjustment, rider_incentive = action
        reward = 1.0 if row['CR'] < 0.5 else -1.0
        # if rider_incentive > 3.0: reward -= 0.5
        # if fare_adjustment < -0.10: reward -= 0.3

        self.current_idx += 1
        done = self.current_idx >= len(self.df)
        return self._get_observation(), reward, done, {}

    def _get_observation(self):
        if self.current_idx >= len(self.df):
            return np.zeros(self.observation_space.shape, dtype=np.float32)

        row = self.df.iloc[self.current_idx]
        return np.array([
            row['Pickup Location'],
            row['Request to Pickup'],
            row['Time of Day'],
            row['Month of Year'],
            row['Historical Demand Forecast']
        ], dtype=np.float32)


    
env = DummyVecEnv([lambda: RideHailingEnv_HDF(df)])
model = PPO("MlpPolicy", env, verbose=1, policy_kwargs=policy_kwargs)
model.learn(total_timesteps=200000)
model.save("dwf_rl_HDF_model")


Using cpu device
-----------------------------
| time/              |      |
|    fps             | 1263 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 929         |
|    iterations           | 2           |
|    time_elapsed         | 4           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.005183404 |
|    clip_fraction        | 0.0384      |
|    clip_range           | 0.2         |
|    entropy_loss         | -2.84       |
|    explained_variance   | -0.0183     |
|    learning_rate        | 0.0003      |
|    loss                 | 4.74        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.00439    |
|    std                  | 0.998       |
|    value_loss           | 9.62        |
-----------------