In [None]:
%%capture
!pip install -U ray
!pip install lz4
!pip install or-gym
!pip install ray[rllib]

In [19]:
%%capture
import or_gym
from or_gym.utils import create_env
import ray
from ray.rllib import agents
from ray import tune
import pandas as pd
import numpy as np

In [20]:
def register_env(env_name, env_config=None):
    """Register an or-gym environment with Ray Tune under ``env_name``.

    Args:
        env_name: Name handed to ``create_env`` and used as the Tune registry key.
        env_config: Optional configuration forwarded to the environment
            constructor through its ``env_config`` keyword. Defaults to an
            empty dict.
    """
    # None is the sentinel so that separate calls never share one mutable default.
    if env_config is None:
        env_config = {}

    # Callable returned by or-gym; invoking it builds the environment.
    env_factory = create_env(env_name)

    def _build_env(creator_arg):
        # Tune calls the creator with a single positional argument. It is passed
        # straight through as the first positional argument, exactly as the
        # original lambda did (whose parameter was confusingly named `env_name`).
        return env_factory(creator_arg, env_config=env_config)

    tune.register_env(env_name, _build_env)

In [21]:
# Build the environment once so its parameter attributes can be inspected.
env = or_gym.make('InvManagement-v1')

# Descriptive label -> name of the attribute on the environment that holds it.
_ENV_ATTRIBUTE_BY_LABEL = {
    'Initial_Inventory': 'I0',
    'Units_BackLog_Price': 'k',
    'Production_Capacity': 'c',
    'Units_Replenishment_Cost': 'r',
    'Units_Holding_Cost': 'h',
    'Lead_Times': 'L',
}
env_var = {
    label: getattr(env, attribute)
    for label, attribute in _ENV_ATTRIBUTE_BY_LABEL.items()
}


In [22]:
# Readable aliases for the environment's parameter attributes.
(
    Initial_Inventory,
    Units_BackLog_Cost,
    Production_Capacity,
    Units_Replenishment_Cost,
    Units_Holding_Cost,
    Lead_Times,
) = (env.I0, env.k, env.c, env.r, env.h, env.L)


In [23]:
env_name = 'InvManagement-v1'

# Per-stage parameters, one entry per stage; '-' is a placeholder for a stage with
# no value. Rows containing the placeholder are built with dtype=object: without
# it NumPy promotes the whole array to strings ('50', '150', ...), so the numbers
# could no longer be used as numbers.
# NOTE(review): the environment exposes these quantities as I0, k, r, h, L and c
# (see the `env_var` mapping). Confirm that or-gym accepts these descriptive keys
# as overrides when this dict is passed as `env_config`.
env_config = {'Initial_Inventory': np.array([50, 150, 200, '-'], dtype=object),
              'Units_BackLog_Price': np.array([0.1, 0.55, 0.075, 0.05]),
              'Units_Replenishment_Cost': np.array([2, 1.75, 0.75, 0.5]),
              'Units_Holding_Cost': np.array([0.15, 0.1, 0.05, '-'], dtype=object),
              'Lead_Times': np.array([3, 5, 10, '-'], dtype=object),
              'Production_Capacity': np.array([100, 90, 80, '-'], dtype=object),
              }



In [24]:
# Show the configuration as a table: one row per parameter, one column per stage.
stage_labels = ['Stage_0', 'Stage_1', 'Stage_2', 'Stage_3']
pd.DataFrame(data=env_config, index=stage_labels).transpose().style

Unnamed: 0,Stage_0,Stage_1,Stage_2,Stage_3
Initial_Inventory,50.0,150.0,200.0,-
Units_BackLog_Price,0.1,0.55,0.075,0.050000
Units_Replenishment_Cost,2.0,1.75,0.75,0.500000
Units_Holding_Cost,0.15,0.1,0.05,-
Lead_Times,3.0,5.0,10.0,-
Production_Capacity,100.0,90.0,80.0,-


In [25]:
# Environment and RL Configuration Settings

# Network settings: separate policy/value layers, two ReLU hidden layers of 256.
model_config = {
    'vf_share_layers': False,
    'fcnet_activation': 'relu',
    'fcnet_hiddens': [256, 256],
}

# Trainer settings shared by the training cells.
rl_config = {
    'env': env_name,
    'num_workers': 10,
    'env_config': env_config,
    'model': model_config,
    'framework': 'torch',
    'lr': 1e-3,
    'clip_param': 0.1,
}


In [7]:
import torch

# Report whether PyTorch can see a CUDA device on this machine.
cuda_ready = torch.cuda.is_available()
cuda_ready

True

In [26]:
import time

## Actor-Critic Method




In [None]:
# Register environment
start = time.time()
ray.shutdown()  # drop any Ray session left over from a previous run of this cell
register_env(env_name, env_config)

# Initialize Ray and Build Agent
ray.init(ignore_reinit_error=True, num_cpus=10)

# `clip_param` is a PPO hyperparameter and is not part of the A2C configuration.
# RLlib may reject it as an unknown config parameter, so it is not forwarded to
# the A2C trainer. `rl_config` itself is left untouched for other trainers.
a2c_config = {key: value for key, value in rl_config.items()
              if key != 'clip_param'}

# Number of `agent.train()` calls; each call contributes one entry to `results`.
n_train_iterations = 800

results = []
try:
    agent = agents.a3c.A2CTrainer(env=env_name,
        config=a2c_config)
    for i in range(n_train_iterations):
        res = agent.train()
        results.append(res)
        # Refresh the single-line progress read-out every fifth iteration.
        if (i+1) % 5 == 0:
            print('\rIter: {}\tReward: {:.2f}'.format(
                    i+1, res['episode_reward_mean']), end='')
finally:
    # Release Ray's workers even when training fails or is interrupted, and
    # report the wall time that was previously measured but never shown.
    ray.shutdown()
    end = time.time()
    print('\nTraining wall time: {:.1f} s'.format(end - start))

In [33]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import gridspec


def _learner_stat(iteration_result, key):
    """Read one default-policy learner statistic from a training result dict."""
    learner_stats = iteration_result['info']['learner']['default_policy']['learner_stats']
    return learner_stats[key]


# Unpack values from each iteration
rewards = np.hstack([entry['hist_stats']['episode_reward'] for entry in results])
pol_loss = [_learner_stat(entry, 'policy_loss') for entry in results]
vf_loss = [_learner_stat(entry, 'vf_loss') for entry in results]
p = 100

# Trailing statistics: entry k covers episode k and up to p episodes before it.
window_starts = [max(0, idx - p) for idx in range(len(rewards))]
mean_rewards = np.array([np.mean(rewards[lo:idx + 1])
                         for idx, lo in enumerate(window_starts)])
std_rewards = np.array([np.std(rewards[lo:idx + 1])
                        for idx, lo in enumerate(window_starts)])

fig = plt.figure(constrained_layout=True, figsize=(20, 10))
gs = fig.add_gridspec(2, 4)

# Left half of the figure: smoothed reward curve with a one-std band around it.
ax0 = fig.add_subplot(gs[:, :-2])
episode_axis = np.arange(len(mean_rewards))
lower_band = mean_rewards - std_rewards
upper_band = mean_rewards + std_rewards
ax0.fill_between(episode_axis, lower_band, upper_band,
                 label='Standard Deviation', alpha=0.3)
ax0.plot(mean_rewards, label='Mean Rewards')
ax0.set_ylabel('Rewards')
ax0.set_xlabel('Episode')
ax0.set_title('Training Rewards')
ax0.legend()

# Top-right panel: policy loss per training iteration.
ax1 = fig.add_subplot(gs[0, 2:])
ax1.plot(pol_loss)
ax1.set_ylabel('Loss')
ax1.set_xlabel('Iteration')
ax1.set_title('Policy Loss')

# Bottom-right panel: value-function loss per training iteration.
ax2 = fig.add_subplot(gs[1, 2:])
ax2.plot(vf_loss)
ax2.set_ylabel('Loss')
ax2.set_xlabel('Iteration')
ax2.set_title('Value Function Loss')

#plt.suptitle('Asynchronous Advantage Actor-Critic',fontsize=20)
plt.savefig("Results_ActorCritic.png", dpi=300)
plt.show()

Figure(1440x720)


In [29]:
import pickle
filename='results_A3C.sav'
# To save the in-memory training results instead of loading them, run:
#with open(filename, 'wb') as results_file:
#    pickle.dump(results, results_file)

# Replaces `results` with the pickled run. The context manager closes the file
# handle, which the previous bare `open(...)` never did.
# SECURITY: unpickling can execute arbitrary code; only load files you created.
with open(filename, 'rb') as results_file:
    results = pickle.load(results_file)

In [38]:
import pickle
filename_3='results_ppo_0.3.sav'
filename_2='results_ppo_0.2.sav'

# Load the two saved PPO runs. Context managers close the file handles, which
# the previous bare `open(...)` calls never did.
# SECURITY: unpickling can execute arbitrary code; only load files you created.
with open(filename_3, 'rb') as results_file:
    results_ppo_3 = pickle.load(results_file)
with open(filename_2, 'rb') as results_file:
    results_ppo_2 = pickle.load(results_file)

In [44]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import gridspec


def _unpack_run(run_results):
    """Split a list of training results into rewards and loss series.

    Returns:
        (rewards, policy_loss, vf_loss): the per-episode rewards of all
        iterations concatenated into one array, and one policy-loss and one
        value-function-loss entry per iteration.
    """
    episode_rewards = np.hstack([res['hist_stats']['episode_reward']
                                 for res in run_results])
    stats = [res['info']['learner']['default_policy']['learner_stats']
             for res in run_results]
    return (episode_rewards,
            [s['policy_loss'] for s in stats],
            [s['vf_loss'] for s in stats])


def _rolling_stats(values, window):
    """Return (mean, std) arrays over a trailing window of ``values``.

    Entry ``i`` summarises ``values[max(0, i - window):i + 1]``: the current
    sample plus up to ``window`` preceding ones.
    """
    spans = [values[max(0, i - window):i + 1] for i in range(len(values))]
    return (np.array([np.mean(span) for span in spans]),
            np.array([np.std(span) for span in spans]))


# Unpack values from each iteration
p = 100

rewards_ppo_3, pol_loss_ppo_3, vf_loss_ppo_3 = _unpack_run(results_ppo_3)
# The std is taken from this run's own rewards; it previously read the unrelated
# `rewards` array for every episode at index >= p.
mean_rewards_3, std_rewards_3 = _rolling_stats(rewards_ppo_3, p)

rewards_ppo_2, pol_loss_ppo_2, vf_loss_ppo_2 = _unpack_run(results_ppo_2)
mean_rewards_2, std_rewards_2 = _rolling_stats(rewards_ppo_2, p)

# (legend label, rolling mean, rolling std, policy loss, vf loss, colour) per run.
# The colour is explicit so each shaded band matches its mean curve.
ppo_runs = [
    ('PPO Clip Gradient 0.3', mean_rewards_3, std_rewards_3,
     pol_loss_ppo_3, vf_loss_ppo_3, 'C0'),
    ('PPO Clip Gradient 0.2', mean_rewards_2, std_rewards_2,
     pol_loss_ppo_2, vf_loss_ppo_2, 'C1'),
]

fig = plt.figure(constrained_layout=True, figsize=(20, 10))
gs = fig.add_gridspec(2, 4)
ax0 = fig.add_subplot(gs[:, :-2])
ax1 = fig.add_subplot(gs[0, 2:])
ax2 = fig.add_subplot(gs[1, 2:])

for run_label, run_mean, run_std, run_pol_loss, run_vf_loss, run_color in ppo_runs:
    # Shade one standard deviation around each PPO curve. The band used to be
    # drawn from `mean_rewards`/`std_rewards`, which belong to a different run.
    ax0.fill_between(np.arange(len(run_mean)),
                     run_mean - run_std,
                     run_mean + run_std,
                     color=run_color, alpha=0.3,
                     label=run_label + ' (Standard Deviation)')
    ax0.plot(run_mean, color=run_color, label=run_label)
    ax1.plot(run_pol_loss, color=run_color, label=run_label)
    ax2.plot(run_vf_loss, color=run_color, label=run_label)

ax0.set_ylabel('Rewards')
ax0.set_xlabel('Episode')
ax0.set_title('Training Rewards')
ax0.legend()

ax1.set_ylabel('Loss')
ax1.set_xlabel('Iteration')
ax1.set_title('Policy Loss')
ax1.legend()  # the curve labels were set before but never displayed

ax2.set_ylabel('Loss')
ax2.set_xlabel('Iteration')
ax2.set_title('Value Function Loss')
ax2.legend()

plt.savefig("Testing.png", dpi=300)
plt.show()

Figure(1440x720)
