In [1]:
import numpy as np
import gym
from gym import spaces

from stable_baselines3.common.env_checker import check_env
from stable_baselines3 import DQN, A2C # PPO2, ACKTR
from stable_baselines3.common.cmd_util import make_vec_env

  from .autonotebook import tqdm as notebook_tqdm


In [63]:
# class OldInventory_backlogging(gym.Env):
#   def __init__(self, inv_bound, mu, sigma, h, b):
#     super(Inventory_backlogging, self).__init__()

#     # Size of the 1D-grid
#     self.inv_bound = inv_bound
#     self.h = h
#     self.b = b
#     self.mu = mu
#     self.sigma = sigma

#     # Initialize the agent at the right of the grid (inventroy=0, period=0)
#     self.inv = 0
#     self.period = 0
    
#     # Define action and observation space
#     # They must be gym.spaces objects
#     # Example when using discrete actions, we have two: left and right
#     n_actions = 20
#     self.action_UB = 20
#     self.action_space = spaces.Discrete(n_actions)

#     # The observation will be the coordinate of the agent
#     # this can be described both by Discrete and Box space
#     self.observation_space = spaces.Box(
#         low=-self.inv_bound, high=self.inv_bound, shape=(1,), dtype=np.float32
#       )
#     # self.period = 0

#   def reset(self):
#     """
#     Important: the observation must be a numpy array
#     :return: (np.array) 
#     """
#     # Initialize the agent at the right of the grid (set inventory to 0 and period to 0)
#     self.inv = 0
#     self.period = 0
#     # here we convert to float32 to make it more general (in case we want to use continuous actions)
#     return np.array([self.inv]).astype(np.float32)

  def step(self, action):
    self.period +=1
    self.inv += action
    demand = np.random.normal(loc=self.mu, scale=self.sigma)
    self.inv -= demand
    
    # Account for the boundaries of the grid
    self.inv = np.clip(self.inv, -self.inv_bound, self.inv_bound)

    # Are we at the left of the grid?

    # Stop trajectory when I hit period 10000
    done = bool(self.period == 10000)

    
    # Null reward everywhere except when reaching the goal (left of the grid)
    
    # If inventory is greater than 0 -> we have a holding cost
    if self.inv >= 0:
      reward = -self.inv*self.h
      
    # If inventory i negative -> we have a backlog cost
    if self.inv < 0:
      reward = self.inv*self.b
   
    
    # Optionally we can pass additional info, we are not using that for now
    info = {}

    return np.array([self.inv]).astype(np.float32), reward, done, info

#   def render(self, mode='console'):
#     if mode != 'console':
#       raise NotImplementedError()
#     print(self.inv)

#   def close(self):
#     pass

In [428]:
class Inventory_backlogging(gym.Env):
    def __init__(self, arr_rate, prop_rate_1, prop_rate_2, queue_capacity, max_n_events=10000, inf_reward=-9999):
        super(Inventory_backlogging, self).__init__()

        self.arr_rate = arr_rate
        self.prop_rate_1 = prop_rate_1
        self.prop_rate_2 = prop_rate_2

        self.queue_capacity = queue_capacity

        self.max_n_events = max_n_events
        self.inf_reward = inf_reward
        
        self.state = np.array([0, 0, 0])
        self.actions = [0, 1, 2, 3] 

        self.n_actions = len(self.actions)
        self.action_space = spaces.Discrete(self.n_actions)

        self.observation_space = spaces.MultiDiscrete((queue_capacity + 1, 2, 2))
        self.period = 0

    @property
    def tot_prob(self):
        return self.arr_rate + self.prop_rate_1 + self.prop_rate_2


    def get_possible_actions(self):
        """
        Get all possible actions
        """
        possible_actions = []
        if self.state[0] == 0 or not(self.state[0] > 0 and sum(self.state[1:]) == 0):
            possible_actions.append(0)
        if self.state[1] == 0 and self.state[0] >= 1:
            possible_actions.append(1)
        if self.state[2] == 0 and self.state[0] >= 1:
            possible_actions.append(2)
        if self.state[1] == 0 and self.state[2] == 0 and self.state[0] >= 2:
            possible_actions.append(3)
        return possible_actions
        


    def reset(self):
        """
        Reset the environment
        """
        self.state = np.array([0, 0, 0])
        self.period = 0
        return self.state.astype(np.float32)


    def action_transition(self, action):
        """
        Transition function
        """
        if action not in self.get_possible_actions():
            return self.inf_reward
        if action == 0:
            return -sum(self.state)
        if action == 1:
            self.state[0] -= 1
            self.state[1] += 1
            return -sum(self.state)
        if action == 2:
            self.state[0] -= 1
            self.state[2] += 1
            return -sum(self.state)
        if action == 3:
            self.state[0] -= 2
            self.state[1] += 1
            self.state[2] += 1
            return -sum(self.state)


    def event_transition(self):
        """
        Transition function
        """
        next_event = np.random.choice(
            [0, 1, 2], 
            p=[
                self.arr_rate / self.tot_prob,
                self.prop_rate_1 / self.tot_prob,
                self.prop_rate_2 / self.tot_prob
            ]
        )
        if next_event == 0:
            self.state[0] = min(self.state[0] + 1, self.queue_capacity)
        elif next_event == 1:
            self.state[1] = max(self.state[1] - 1, 0)
        elif next_event == 2:
            self.state[2] = max(self.state[2] - 1, 0)


    def step(self, action):
        """
        Perform one time step within the environment
        """
        self.period += 1
        reward = self.action_transition(action)
        self.event_transition()

        done = self.period >= self.max_n_events

        info = {}
        
        return self.state.astype(np.float32), reward, done, info


    def render(self, mode='console'):
        if mode != 'console':
            raise NotImplementedError()
        print(self.inv)


    def close(self):
        pass 

In [430]:
# Instantiate the env
env = Inventory_backlogging(
    arr_rate=2,
    prop_rate_1=1.5,
    prop_rate_2=0.5,
    queue_capacity=100,
    max_n_events=10000,
    inf_reward=-9999
)
# wrap it
env = make_vec_env(lambda: env, n_envs=1)

In [431]:
obs = env.reset()
n_steps = 10000
rewards = 0
step = 0

In [432]:
action, _ = model.predict(obs, deterministic=True)
print("Start state: ", obs)
obs, reward, done, info = env.step(action)
print("Finish state: ", obs)
print("action: ", action)
print("reward: ", reward)

Start state:  [[0 0 0]]
Finish state:  [[0 0 0]]
action:  [3]
reward:  [0.]


In [433]:
# Train the agent

# Verbose = 1 -> the verbosity level: 0 none, 1 training information, 2 tensorflow debug 
model = A2C('MlpPolicy', env, verbose=1).learn(10000)

# model = ACKTR('MlpPolicy', env, verbose=1).learn(500000)
# model = DQN('MlpPolicy',env,verbose=1).learn(100000)

Using cpu device
-------------------------------------
| time/                 |           |
|    fps                | 917       |
|    iterations         | 100       |
|    time_elapsed       | 0         |
|    total_timesteps    | 500       |
| train/                |           |
|    entropy_loss       | -1.02     |
|    explained_variance | -1.19e-07 |
|    learning_rate      | 0.0007    |
|    n_updates          | 99        |
|    policy_loss        | 13.1      |
|    value_loss         | 141       |
-------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 930      |
|    iterations         | 200      |
|    time_elapsed       | 1        |
|    total_timesteps    | 1000     |
| train/                |          |
|    entropy_loss       | -0.477   |
|    explained_variance | 0.00108  |
|    learning_rate      | 0.0007   |
|    n_updates          | 199      |
|    policy_loss        | 9.56     |
|    va

In [434]:
# Test the trained agent
obs = env.reset()
n_steps = 100000
rewards = 0

for step in range(n_steps):
  action, _ = model.predict(obs, deterministic=True)
  print(f"Step {step + 1}")
  print("Action: ", action)
  obs, reward, done, info = env.step(action)
  print('obs=', obs, 'reward=', reward, 'done=', done)
  #env.render(mode='console')
  rewards += reward[0]
  if done:
    # Note that the VecEnv resets automatically
    # when a done signal is encountered
    print("Goal reached!", "reward=", reward)
    break

Step 1
Action:  [2]
obs= [[0 0 0]] reward= [0.] done= [False]
Step 2
Action:  [2]
obs= [[0 0 0]] reward= [0.] done= [False]
Step 3
Action:  [2]
obs= [[0 0 0]] reward= [0.] done= [False]
Step 4
Action:  [2]
obs= [[0 0 0]] reward= [0.] done= [False]
Step 5
Action:  [2]
obs= [[0 0 0]] reward= [0.] done= [False]
Step 6
Action:  [2]
obs= [[1 0 0]] reward= [0.] done= [False]
Step 7
Action:  [2]
obs= [[0 0 1]] reward= [104.] done= [False]
Step 8
Action:  [0]
obs= [[1 0 1]] reward= [104.] done= [False]
Step 9
Action:  [0]
obs= [[1 0 1]] reward= [103.] done= [False]
Step 10
Action:  [0]
obs= [[1 0 0]] reward= [103.] done= [False]
Step 11
Action:  [2]
obs= [[1 0 1]] reward= [104.] done= [False]
Step 12
Action:  [0]
obs= [[1 0 1]] reward= [103.] done= [False]
Step 13
Action:  [0]
obs= [[1 0 1]] reward= [103.] done= [False]
Step 14
Action:  [0]
obs= [[2 0 1]] reward= [103.] done= [False]
Step 15
Action:  [0]
obs= [[3 0 1]] reward= [102.] done= [False]
Step 16
Action:  [0]
obs= [[4 0 1]] reward= [1

In [420]:
rewards/n_steps

-0.5984