In [1]:

import numpy as np
import gymnasium as gym
from gymnasium import spaces
import matplotlib.pyplot as plt
from stable_baselines3 import DQN
from stable_baselines3.common.vec_env import DummyVecEnv


In [2]:

class PackagingCenterEnv(gym.Env):
    def __init__(self):
        super(PackagingCenterEnv, self).__init__()
        self.observation_space = spaces.Box(low=0, high=1, shape=(3,), dtype=np.float32)  # [load, hour, zip_count]
        self.action_space = spaces.Discrete(2)  # 0 = do nothing, 1 = reroute
        self.current_step = 0
        self.max_steps = 50
        self.state = None
        self.feature_names = ['CHUTE_LOAD', 'HOUR', 'ZIP_COUNT']

    def reset(self, seed=None, options=None):
        self.current_step = 0
        self.state = np.array([0.4, 0.0, 1.0], dtype=np.float32)  # moderate load
        return self.state, {}

    def step(self, action):
        self.current_step += 1
        load, hour, zip_count = self.state

        # Simulate spike
        if self.current_step in [10, 20, 35]:
            load += 0.6  # spike

        # Processing delay
        delay = load * (1.2 if action == 0 else 0.9) + np.random.normal(0, 0.05)
        reward = -delay

        # Reward shaping: penalize overload
        if load > 0.9:
            reward -= 2
        if action == 1 and delay < 2:
            reward += 1

        # Clip and update state
        load = min(max(load + np.random.normal(0, 0.02), 0), 1)
        hour = (hour + 1) % 24
        zip_count = np.clip(zip_count + np.random.normal(0, 0.1), 0, 10)
        self.state = np.array([load, hour, zip_count], dtype=np.float32)

        done = self.current_step >= self.max_steps
        return self.state, reward, done, False, {}


In [3]:

env = DummyVecEnv([lambda: PackagingCenterEnv()])
model = DQN("MlpPolicy", env, verbose=0, tensorboard_log="./logs")
model.learn(total_timesteps=5000)


<stable_baselines3.dqn.dqn.DQN at 0x2b3c447e4e0>

In [6]:
obs = env.reset()
done = False
rewards = []
actions = []

while not done:
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    rewards.append(reward[0])  # reward is in list format
    actions.append(int(action))

print("Total Reward:", sum(rewards))


Total Reward: -71.964874


  actions.append(int(action))
