In [18]:
# Load Libraries
import gym
from gym import spaces
import pandas as pd
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import os

In [19]:
# Load your CSV
df = pd.read_csv("train_data.csv")

# Add a 'Base Fare' column with realistic values (e.g., $5 to $20)
np.random.seed(42)  # for reproducibility
df['Base Fare'] = np.round(np.random.uniform(5, 20, size=len(df)), 2)

In [20]:
# Label encode categorical features
df['Time of Day'] = LabelEncoder().fit_transform(df['Time of Day'])
df['Month of Year'] = LabelEncoder().fit_transform(df['Month of Year'])
df['Pickup Location'] = LabelEncoder().fit_transform(df['Pickup Location'])

print("Dataset loaded and encoded.")
print(df.head())

Dataset loaded and encoded.
   Pickup Location  Request to Pickup  Hour of Day  Time of Day  \
0               29              327.0           16            0   
1              187              253.0            8            2   
2               33              336.0           13            0   
3              178              102.0           13            0   
4              129              473.0            4            3   

   Month of Year  Surge Pricing Indicator  DWF Reward Applied  \
0             11                        0                0.00   
1              7                        1                0.00   
2              5                        0                0.00   
3              6                        0                0.00   
4              3                        0                3.76   

   Historical Demand Forecast       RPI  incentive  fare_adjustment       DPI  \
0                    0.373889  0.257379   3.530438        -0.112454  0.017936   
1               

In [None]:
class RideHailingEnv(gym.Env):
    def __init__(self, df):
        super(RideHailingEnv, self).__init__()
        self.df = df.reset_index(drop=True)
        self.current_idx = 0

        # Normalized 8-D state vector
        self.observation_space = spaces.Box(low=0, high=1, shape=(8,), dtype=np.float32)

        # Action: [fare_adjustment %, rider_incentive $]
        self.action_space = spaces.Box(
            low=np.array([-0.15, 0.0]),
            high=np.array([0.15, 5.0]),
            dtype=np.float32
        )

    def reset(self):
        self.current_idx = 0
        return self._get_observation()

    def step(self, action):
        if self.current_idx >= len(self.df) - 1:
            return self._get_observation(), 0.0, True, {}

        # Extract current ride
        row = self.df.iloc[self.current_idx]

        # Parse action components
        fare_adjustment, rider_incentive = action
        fare_adjustment = np.clip(fare_adjustment, -0.15, 0.15)
        rider_incentive = np.clip(rider_incentive, 0.0, 5.0)

        # Simulated outcome logic
        ride_completed = row['CR'] < 0.5  # Simplified rule based on synthetic CR

        # Cost-aware reward function
        base_fare = 10.0
        base_fare = row['Base Fare'] if 'Base Fare' in row else 10.0  # fallback
        delta = 0.2 * base_fare  # scaling for fare adjustment
        beta = 2.0               # fixed penalty for cancellation

        reward = -(rider_incentive + fare_adjustment * delta)
        if not ride_completed:
            reward -= beta

        self.current_idx += 1
        done = self.current_idx >= len(self.df)

        return self._get_observation(), reward, done, {}

    def _get_observation(self):
        if self.current_idx >= len(self.df): 
            return np.zeros(self.observation_space.shape, dtype=np.float32)

        row = self.df.iloc[self.current_idx]
        return np.array([
            row['Pickup Location'],
            row['Request to Pickup'],
            row['Time of Day'],
            row['Month of Year'],
            row['RPI'],
            row['DPI'],
            row['CR'],
            row['Historical Demand Forecast']
        ], dtype=np.float32)

    

#     # Retrain Model with Updated Observation Space (8 features)
# policy_kwargs = dict(net_arch=[64, 64])
# env = DummyVecEnv([lambda: RideHailingEnv(df)])
# model = PPO("MlpPolicy", env, verbose=1, policy_kwargs=policy_kwargs)

# # ⏱Increase Training Time
# model.learn(total_timesteps=400000)  # More steps = better convergence

# # Save as new version
# model.save("dwf_rl_pricing_model_v6")  # Updated version aligned with your final env


In [22]:
import gym
import numpy as np
from gym import spaces

class RideHailingEnv_Baseline(gym.Env):
    def __init__(self, df):
        super(RideHailingEnv_Baseline, self).__init__()
        self.df = df.reset_index(drop=True)
        self.current_idx = 0

        # Observation: 4 basic context features
        self.observation_space = spaces.Box(low=0, high=1, shape=(4,), dtype=np.float32)
        self.action_space = spaces.Box(low=np.array([-0.15, 0.0]), high=np.array([0.15, 5.0]), dtype=np.float32)

    def reset(self):
        self.current_idx = 0
        return self._get_observation()

    def step(self, action):
        if self.current_idx >= len(self.df) - 1:
            return self._get_observation(), 0.0, True, {}

        row = self.df.iloc[self.current_idx]
        fare_adjustment, rider_incentive = action

        # --- Basic dynamic cancellation logic ---
        prob_cancel = row['CR']
        if rider_incentive > 2.5:
            prob_cancel *= 0.7
        if fare_adjustment > 0.1:
            prob_cancel *= 0.8

        cancelled = np.random.rand() < prob_cancel
        reward = 1.0 if not cancelled else -1.0

        self.current_idx += 1
        done = self.current_idx >= len(self.df)
        return self._get_observation(), reward, done, {}

    def _get_observation(self):
        if self.current_idx >= len(self.df): 
            return np.zeros(self.observation_space.shape, dtype=np.float32)

        row = self.df.iloc[self.current_idx]
        return np.array([
            row['Pickup Location'],
            row['Request to Pickup'],
            row['Time of Day'],
            row['Month of Year']
        ], dtype=np.float32)


# env = DummyVecEnv([lambda: RideHailingEnv_Baseline(df)])
# model = PPO("MlpPolicy", env, verbose=1, policy_kwargs=policy_kwargs)
# model.learn(total_timesteps=400000)
# model.save("dwf_rl_baseline_model")

In [35]:
# === Load dataset ===
df = pd.read_csv("test_data.csv")

In [36]:
np.random.seed(43)  # for reproducibility
df['Base_Fare'] = np.round(np.random.uniform(5, 20, size=len(df)), 2)

In [38]:
from sklearn.preprocessing import LabelEncoder

# Encode only if not already encoded
if df['Time of Day'].dtype == 'object':
    df['Time of Day'] = LabelEncoder().fit_transform(df['Time of Day'])

if df['Month of Year'].dtype == 'object':
    df['Month of Year'] = LabelEncoder().fit_transform(df['Month of Year'])

if df['Pickup Location'].dtype == 'object':
    df['Pickup Location'] = LabelEncoder().fit_transform(df['Pickup Location'])


In [45]:
from stable_baselines3 import PPO
import pandas as pd
import numpy as np


# === Load the trained PPO model ===
model = PPO.load("dwf_rl_baseline_model.zip")

# === Create the environment ===
env = RideHailingEnv_Baseline(df.copy())  # or RideHailingEnv_Baseline

base_fare = env.df.iloc[env.current_idx]["Base_Fare"]

# === Run the model over the dataset and collect actions ===
incentives = 0
fare_adjustments = 0
completions = 0

obs = env.reset()
done = False

while not done:
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    print(f"Obs: {obs}")
    print(f"Action: {action}")
    print(f"Reward: {reward}")

    incentives = incentives + base_fare*action[0]
    fare_adjustments = fare_adjustments + action[1]
   
average_incentive = np.mean(incentives)
average_discount = np.mean(fare_adjustments)


# === Print Summary ===
print(f"\n📊 Evaluation Results:")
print(f"✅ Avg Incentive: ${average_incentive:.2f}")
print(f"✅ Avg Discount: ${average_discount:.2f}")



Obs: [ 42. 505.   3.   1.]
Action: [0.15 0.  ]
Reward: -1.0
Obs: [ 74. 419.   2.   7.]
Action: [0.15 0.  ]
Reward: 1.0
Obs: [161. 141.   3.   6.]
Action: [0.15 0.  ]
Reward: 1.0
Obs: [219. 233.   0.   6.]
Action: [0.15 0.  ]
Reward: 1.0
Obs: [132. 274.   3.   1.]
Action: [0.15 0.  ]
Reward: 1.0
Obs: [106. 169.   3.   4.]
Action: [0.15 0.  ]
Reward: 1.0
Obs: [162. 148.   2.   6.]
Action: [0.15 0.  ]
Reward: -1.0
Obs: [ 229. -614.    1.   11.]
Action: [0.15 0.  ]
Reward: 1.0
Obs: [247. 318.   2.  12.]
Action: [-0.15  0.  ]
Reward: -1.0
Obs: [129. 130.   0.   1.]
Action: [0.15 0.  ]
Reward: 1.0
Obs: [236. 238.   0.   7.]
Action: [0.15 0.  ]
Reward: 1.0
Obs: [235. 453.   3.   4.]
Action: [0.15 0.  ]
Reward: -1.0
Obs: [227. 195.   2.   2.]
Action: [0.15 0.  ]
Reward: 1.0
Obs: [145. 203.   1.   1.]
Action: [0.15 0.  ]
Reward: -1.0
Obs: [262. 279.   1.   7.]
Action: [0.15 0.  ]
Reward: -1.0
Obs: [ 25. 260.   3.   3.]
Action: [0.15 0.  ]
Reward: 1.0
Obs: [170. 180.   2.   7.]
Action: [0.15 0. 

In [42]:
# === Load the trained PPO model ===
model = PPO.load("dwf_rl_pricing_model_v6.zip")

# === Create the environment ===
env = RideHailingEnv(df.copy())  # or RideHailingEnv_Baseline

base_fare = env.df.iloc[env.current_idx]["Base_Fare"]

# === Run the model over the dataset and collect actions ===
incentives = 0
fare_adjustments = 0
completions = 0

obs = env.reset()
done = False

while not done:
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)

    incentives = incentives + base_fare*action[0]
    fare_adjustments = fare_adjustments + action[1]
   
average_incentive = np.mean(incentives)
average_discount = np.mean(fare_adjustments)


# === Print Summary ===
print(f"\n📊 Evaluation Results:")
print(f"✅ Avg Incentive: ${average_incentive:.2f}")
print(f"✅ Avg Discount: ${average_discount:.2f}")




📊 Evaluation Results:
✅ Avg Incentive: $-10082.80
✅ Avg Discount: $0.45
