In [5]:
import numpy as np
import pandas as pd
import pickle   # for saving/loading models
from sklearn.preprocessing import StandardScaler
import joblib

In [6]:
import joblib
features_df = pd.read_parquet("panel_augmented2.parquet")
model_filepath = "market_simulator_model2.joblib"
demand_model = joblib.load(model_filepath)

In [7]:
model_features = [
    "lag_units_1w",
    "lag_units_2w",
    "lag_units_3w",
    "lag_units_4w",
    "rolling_mean_4w",
    "rolling_std_4w",
    "rolling_mean_8w",
    "rolling_std_8w",
    "rolling_mean_12w",
    "rolling_std_12w",
    "unit_price",
    "discount_depth",
    "price_change",
    "price_vs_ref_ratio",
    "ref_price",
    "promo_flag",
    "promo_code_encoded",
    "month",
    "quarter",
    "weekofyr",
    "is_month_start",
    "is_month_end",
    "has_special_event",
    "temp_mean",
    "temp_max",
    "precip_sum",
    "weather_missing_flag",
    "cpi_bev",
    "brand_encoded",
    "margin_pct",
    "store"
]

In [24]:
upc_list = [7800000075,7336070997,1690000302,4900000981,7241009310,5490000029,4100010728,1200000085,1200000013,1660000064,4900000551]

In [8]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np

class MultiUPCEnv(gym.Env):
    def __init__(
        self,
        demand_model,
        features_df,
        upc_list,
        price_change_bounds=(-0.30, 0.30),
        noise_std=0.02
    ):
        super(MultiUPCEnv, self).__init__()

        self.demand_model = demand_model
        self.features_df = features_df
        self.upc_list = upc_list
        self.price_change_bounds = price_change_bounds
        self.noise_std = noise_std

        self.current_upc = None
        self.week_idx_ptr = None
        self.weeks = None
        self.current_price = None

        # --------------------------
        # OBSERVATION SPACE (continuous)
        # --------------------------
        self.state_features = [
            f for f in model_features if f not in ["unit_price", "price_change"]
        ]

        self.obs_dim = len(self.state_features) + 3   # price, prev_units, prev_price_change
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(self.obs_dim,), dtype=np.float32
        )

        # --------------------------
        # ACTION SPACE (continuous)
        # PPO outputs value in [-1, 1]; we scale it to price_change_bounds
        # --------------------------
        self.action_space = spaces.Box(
            low=np.array([-1.0]), high=np.array([1.0]), dtype=np.float32
        )

    def _scale_action(self, a):
        """convert [-1,1] → [min_change,max_change]"""
        low, high = self.price_change_bounds
        return low + (a + 1) * (high - low) / 2

    def reset(self, *, seed=None, options=None):
        super().reset(seed=seed)
    
        self.current_upc = np.random.choice(self.upc_list)
        df_u = self.features_df[self.features_df["upc"] == self.current_upc]
    
        self.weeks = sorted(df_u["week_x"].unique())
        self.week_idx_ptr = 0
    
        row = df_u[df_u["week_x"] == self.weeks[0]].iloc[0]
        self.current_price = float(row["unit_price"])
    
        self.prev_units = 0.0
        self.prev_price_change = 0.0
    
        obs = self._get_state(row)
        return obs, {}


    def _get_state(self, row):
        base_state = np.array(
            [row[feat] for feat in self.state_features], dtype=np.float32
        )
        extended = np.array(
            [self.current_price, self.prev_units, self.prev_price_change],
            dtype=np.float32
        )
        return np.nan_to_num(np.concatenate([base_state, extended]))

    def step(self, action):

        df_u = self.features_df[self.features_df["upc"] == self.current_upc]
    
        # scale action from [-1, 1] to price_change_bounds
        price_change = float(self._scale_action(float(action[0])))
    
        new_price = float(
            np.clip(self.current_price * (1.0 + price_change), 0.5, 10.0)
        )
    
        row = df_u[df_u["week_x"] == self.weeks[self.week_idx_ptr]].iloc[0]
    
        # prepare input for demand model
        model_input = row.copy()
        model_input["unit_price"] = new_price
        model_input["price_change"] = price_change
    
        X = model_input[model_features].values.reshape(1, -1)
        base_units = self.demand_model.predict(X)[0]
        units = max(base_units * (1 + np.random.normal(0, self.noise_std)), 0)
    
        cost = row.get("cost_real", row.get("unit_cost", 0))
        profit = (new_price - cost) * units
    
        reward = profit / 100.0
    
        # update stored vars
        self.prev_units = units
        self.prev_price_change = price_change
        self.current_price = new_price
    
        # move to next week
        self.week_idx_ptr += 1
    
        terminated = self.week_idx_ptr >= len(self.weeks)
        truncated = False  # PPO never truncates episodes
    
        info = {"profit": profit, "price": new_price, "units": units}
    
        if terminated:
            # return dummy obs required by SB3
            obs = np.zeros(self.obs_dim, dtype=np.float32)
            return obs, reward, terminated, truncated, info
    
        next_row = df_u[df_u["week_x"] == self.weeks[self.week_idx_ptr]].iloc[0]
        next_state = self._get_state(next_row)
    
        return next_state, reward, terminated, truncated, info


In [8]:
!pip install --user stable-baselines3


Collecting stable-baselines3
  Using cached stable_baselines3-2.7.0-py3-none-any.whl.metadata (4.8 kB)
Using cached stable_baselines3-2.7.0-py3-none-any.whl (187 kB)
Installing collected packages: stable-baselines3
Successfully installed stable-baselines3-2.7.0


In [11]:
import site
import sys
sys.path.append(site.USER_SITE)


In [13]:
import sys
sys.path.append("./local_lib")

try:
    from stable_baselines3 import PPO
    print("✅ Stable-Baselines3 is ready!")
except Exception as e:
    print("❌ Import error:", e)


✅ Stable-Baselines3 is ready!


In [22]:
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.monitor import Monitor

def make_env():
    return Monitor(MultiUPCEnv(
        demand_model=demand_model,
        features_df=features_df,
        upc_list=upc_list,
        noise_std=0.02
    ))

# Vectorized env (helps with PPO stability)
env = DummyVecEnv([make_env])

# PPO model
model = PPO(
    policy="MlpPolicy",
    env=env,
    learning_rate=3e-4,
    n_steps=2048,
    batch_size=256,
    n_epochs=10,
    gamma=0.99,
    gae_lambda=0.95,
    clip_range=0.2,
    verbose=1,
    tensorboard_log="./ppo_logs/"
)

# Train
model.learn(total_timesteps=500_000)
model.save("ppo_pricing_model")


Using cuda device
Logging to ./ppo_logs/PPO_2


  gym.logger.warn(
  gym.logger.warn(


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 389      |
|    ep_rew_mean     | -18.4    |
| time/              |          |
|    fps             | 51       |
|    iterations      | 1        |
|    time_elapsed    | 39       |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 388          |
|    ep_rew_mean          | 112          |
| time/                   |              |
|    fps                  | 49           |
|    iterations           | 2            |
|    time_elapsed         | 82           |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0096813785 |
|    clip_fraction        | 0.128        |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.41        |
|    explained_variance   | -0.0541      |
|    learning_r

In [1]:
from stable_baselines3 import PPO
model = PPO.load("ppo_pricing_model")


Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
Users of this version of Gym should be able to simply replace 'import gym' with 'import gymnasium as gym' in the vast majority of cases.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.


In [2]:
def simulate_upc(upc):
    env = MultiUPCEnv(
        demand_model=demand_model,
        features_df=features_df,
        upc_list=upc_list,       # force env to use only this UPC
        noise_std=0.0
    )
    
    obs, _ = env.reset()
    total_profit = 0
    done = False
    truncated = False

    while not done and not truncated:
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, done, truncated, info = env.step(action)
        total_profit += info["profit"]

    return total_profit


In [26]:
profits = {}
for upc in upc_list:
    p = simulate_upc(upc)
    profits[upc] = p
    print(f"UPC {upc}: Profit = ${p:,.2f}")


  gym.logger.warn(
  gym.logger.warn(


UPC 7800000075: Profit = $39,089.13


  gym.logger.warn(
  gym.logger.warn(


UPC 7336070997: Profit = $19,536.66


  gym.logger.warn(
  gym.logger.warn(


UPC 1690000302: Profit = $1,353.05


  gym.logger.warn(
  gym.logger.warn(


UPC 4900000981: Profit = $82,915.74


  gym.logger.warn(
  gym.logger.warn(


UPC 7241009310: Profit = $1,353.05


  gym.logger.warn(
  gym.logger.warn(


UPC 5490000029: Profit = $20,363.10


  gym.logger.warn(
  gym.logger.warn(


UPC 4100010728: Profit = $20,363.10


  gym.logger.warn(
  gym.logger.warn(


UPC 1200000085: Profit = $23,400.31


  gym.logger.warn(
  gym.logger.warn(


UPC 1200000013: Profit = $39,089.13


  gym.logger.warn(
  gym.logger.warn(


UPC 1660000064: Profit = $19,738.04


  gym.logger.warn(
  gym.logger.warn(


UPC 4900000551: Profit = $19,536.66


In [10]:
upc_list = [418,419,422,425,427,430]

In [11]:
profits = {}
for upc in upc_list:
    p = simulate_upc(upc)
    profits[upc] = p
    print(f"UPC {upc}: Profit = ${p:,.2f}")


  gym.logger.warn(
  gym.logger.warn(


UPC 418: Profit = $1,832.19


  gym.logger.warn(
  gym.logger.warn(


UPC 419: Profit = $1,869.04


  gym.logger.warn(
  gym.logger.warn(


UPC 422: Profit = $1,832.19


  gym.logger.warn(
  gym.logger.warn(


UPC 425: Profit = $1,832.19


  gym.logger.warn(
  gym.logger.warn(


UPC 427: Profit = $2,777.75


  gym.logger.warn(
  gym.logger.warn(


UPC 430: Profit = $1,869.04


In [12]:
upc_list = [1200000230, 4900000639, 420, 4900000663, 1200000231, 1200000013, 450, 4900001884, 4900000634, 1200000844]

In [13]:
profits = {}
for upc in upc_list:
    p = simulate_upc(upc)
    profits[upc] = p
    print(f"UPC {upc}: Profit = ${p:,.2f}")


  gym.logger.warn(
  gym.logger.warn(


UPC 1200000230: Profit = $409,403.53


  gym.logger.warn(
  gym.logger.warn(


UPC 4900000639: Profit = $409,403.53


  gym.logger.warn(
  gym.logger.warn(


UPC 420: Profit = $163,094.96


  gym.logger.warn(
  gym.logger.warn(


UPC 4900000663: Profit = $372,639.13


  gym.logger.warn(
  gym.logger.warn(


UPC 1200000231: Profit = $31,242.37


  gym.logger.warn(
  gym.logger.warn(


UPC 1200000013: Profit = $329,353.69


  gym.logger.warn(
  gym.logger.warn(


UPC 450: Profit = $31,242.37


  gym.logger.warn(
  gym.logger.warn(


UPC 4900001884: Profit = $143,256.08


  gym.logger.warn(
  gym.logger.warn(


UPC 4900000634: Profit = $329,353.69


  gym.logger.warn(
  gym.logger.warn(


UPC 1200000844: Profit = $296,294.72
