In [None]:
%%capture
!pip install -U ray
!pip install lz4
!pip install or-gym
!pip install ray[rllib]

In [1]:
%%capture
import or_gym
from or_gym.utils import create_env
import ray
from ray.rllib import agents
from ray import tune
import pandas as pd
import numpy as np

In [2]:
def register_env(env_name, env_config=None):
    """Register an or-gym environment with Ray Tune under ``env_name``.

    Args:
        env_name: Identifier passed to ``create_env`` and used as the
            registration key in ``tune.register_env``.
        env_config: Optional dict forwarded to the environment constructor as
            its ``env_config`` keyword. A fresh empty dict is used when omitted.
    """
    # Use a None sentinel instead of a mutable default so calls never share
    # the same dict object.
    if env_config is None:
        env_config = {}
    env = create_env(env_name)
    # Tune calls the creator with one argument; it is forwarded positionally
    # to the environment exactly as before, alongside the captured env_config.
    # The parameter is named distinctly so it no longer shadows `env_name`.
    tune.register_env(
        env_name,
        lambda creator_arg: env(creator_arg, env_config=env_config))

In [3]:
# Build the environment once so its parameter attributes can be inspected.
env = or_gym.make('InvManagement-v1')
# Map descriptive labels onto the environment's parameter attributes.
env_var = dict(
    Initial_Inventory=env.I0,
    Units_BackLog_Price=env.k,
    Production_Capacity=env.c,
    Units_Replenishment_Cost=env.r,
    Units_Holding_Cost=env.h,
    Lead_Times=env.L,
)


In [4]:
# Bind descriptive module-level aliases to the environment's parameter
# attributes (same order on both sides of the assignment).
(Initial_Inventory,
 Units_BackLog_Cost,
 Production_Capacity,
 Units_Replenishment_Cost,
 Units_Holding_Cost,
 Lead_Times) = (env.I0, env.k, env.c, env.r, env.h, env.L)


In [5]:
# Registration key of the or-gym environment used throughout the notebook.
env_name = 'InvManagement-v1'
# Parameter values with four entries each (labelled Stage_0..Stage_3 when
# tabulated in the following cell).
# NOTE(review): every array that mixes numbers with the '-' placeholder is
# coerced by NumPy to a string dtype, so its entries are text rather than
# numbers. Also, these keys are descriptive labels rather than the attribute
# names read from `env` earlier (I0, k, r, h, L, c) — confirm the environment
# actually consumes this dict as intended.
env_config = dict(
    Initial_Inventory=np.array([50, 150, 200, '-']),
    Units_BackLog_Price=np.array([0.1, 0.55, 0.075, 0.05]),
    Units_Replenishment_Cost=np.array([2, 1.75, 0.75, 0.5]),
    Units_Holding_Cost=np.array([0.15, 0.1, 0.05, '-']),
    Lead_Times=np.array([3, 5, 10, '-']),
    Production_Capacity=np.array([100, 90, 80, '-']),
)



In [6]:
# Tabulate the configuration: one row per parameter, one column per stage.
pd.DataFrame(
    env_config,
    index=['Stage_{}'.format(stage) for stage in range(4)],
).T.style

Unnamed: 0,Stage_0,Stage_1,Stage_2,Stage_3
Initial_Inventory,50.0,150.0,200.0,-
Units_BackLog_Price,0.1,0.55,0.075,0.050000
Units_Replenishment_Cost,2.0,1.75,0.75,0.500000
Units_Holding_Cost,0.15,0.1,0.05,-
Lead_Times,3.0,5.0,10.0,-
Production_Capacity,100.0,90.0,80.0,-


In [14]:
# Trainer configuration: environment binding, network shape and optimiser
# hyper-parameters handed to the PPO trainer.
rl_config = {
    'env': env_name,
    'num_workers': 8,
    'env_config': env_config,
    # Two hidden layers of 256 ReLU units; value function layers not shared.
    'model': {
        'vf_share_layers': False,
        'fcnet_activation': 'relu',
        'fcnet_hiddens': [256, 256],
    },
    'framework': 'torch',
    'lr': 1e-5,
    'clip_param': 0.2,
}


In [15]:
# Register environment



ray.shutdown()
register_env(env_name, env_config)

# Initialize Ray and Build Agent
ray.init(ignore_reinit_error=True)

agent = agents.ppo.PPOTrainer(env=env_name,
    config=rl_config)
results = []
for i in range(800):
    res = agent.train()
    results.append(res)
    if (i+1) % 5 == 0:
        print('\nIter: {}\tReward: {:.2f}'.format(
                i+1, res['episode_reward_mean']), end='')
        print('\n')
%time        
ray.shutdown()

2021-08-21 20:00:05,228	INFO services.py:1245 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m



Iter: 5	Reward: -88.72


Iter: 10	Reward: -55.37


Iter: 15	Reward: 6.75


Iter: 20	Reward: 59.44


Iter: 25	Reward: 74.50


Iter: 30	Reward: 103.05


Iter: 35	Reward: 151.97


Iter: 40	Reward: 177.12


Iter: 45	Reward: 177.52


Iter: 50	Reward: 199.49


Iter: 55	Reward: 211.85


Iter: 60	Reward: 226.81


Iter: 65	Reward: 239.89


Iter: 70	Reward: 251.52


Iter: 75	Reward: 254.63


Iter: 80	Reward: 250.15


Iter: 85	Reward: 274.52


Iter: 90	Reward: 278.75


Iter: 95	Reward: 292.27


Iter: 100	Reward: 286.62


Iter: 105	Reward: 296.91


Iter: 110	Reward: 307.92


Iter: 115	Reward: 321.34


Iter: 120	Reward: 308.42


Iter: 125	Reward: 309.49


Iter: 130	Reward: 320.61


Iter: 135	Reward: 335.00


Iter: 140	Reward: 328.63


Iter: 145	Reward: 331.31


Iter: 150	Reward: 333.92


Iter: 155	Reward: 335.10


Iter: 160	Reward: 343.94


Iter: 165	Reward: 339.53


Iter: 170	Reward: 352.77


Iter: 175	Reward: 347.27


Iter: 180	Reward: 371.08


Iter: 185	Reward: 369.44


Iter: 190	Reward: 366.50




Iter: 745	Reward: 511.08


Iter: 750	Reward: 506.21






Iter: 755	Reward: 497.96


Iter: 760	Reward: 500.64






Iter: 765	Reward: 508.35


Iter: 770	Reward: 504.76


Iter: 775	Reward: 505.29






Iter: 780	Reward: 512.23






Iter: 785	Reward: 509.38


Iter: 790	Reward: 519.79






Iter: 795	Reward: 509.10


Iter: 800	Reward: 508.92

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.96 µs


In [1]:
import torch
# Show whether PyTorch reports CUDA as available in this kernel.
torch.cuda.is_available()

True

In [19]:
import tensorflow as tf 

In [16]:
import pickle

# Persist the per-iteration training results so they can be reloaded without
# retraining. Context managers guarantee the file is flushed and closed before
# it is read back.
filename='results_ppo_0.2.sav'
with open(filename, 'wb') as results_file:
    pickle.dump(results, results_file)

# Round-trip check: reload what was just written. Only unpickle files that
# this notebook produced itself; pickle is unsafe on untrusted input.
with open(filename, 'rb') as results_file:
    results_ppo_2_ = pickle.load(results_file)

In [18]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import gridspec

# Flatten the per-episode rewards of every training iteration into one array.
rewards = np.hstack([res['hist_stats']['episode_reward'] for res in results])
# One policy-loss and one value-function-loss value per training iteration.
pol_loss = [
    res['info']['learner']['default_policy']['learner_stats']['policy_loss']
    for res in results
]
vf_loss = [
    res['info']['learner']['default_policy']['learner_stats']['vf_loss']
    for res in results
]
# Trailing window: each statistic covers the current episode plus up to `p`
# preceding ones (fewer near the start of the series).
p = 100

mean_rewards = np.array([
    np.mean(rewards[max(0, idx - p):idx + 1])
    for idx in range(len(rewards))
])
std_rewards = np.array([
    np.std(rewards[max(0, idx - p):idx + 1])
    for idx in range(len(rewards))
])

# 2x4 grid: rewards fill the left half, the two loss curves stack on the right.
fig = plt.figure(constrained_layout=True, figsize=(20, 10))
gs = fig.add_gridspec(2, 4)

# Rolling mean reward with a one-standard-deviation band.
ax0 = fig.add_subplot(gs[:, :-2])
ax0.fill_between(
    np.arange(len(mean_rewards)),
    mean_rewards - std_rewards,
    mean_rewards + std_rewards,
    label='Standard Deviation',
    alpha=0.3,
)
ax0.plot(mean_rewards, label='Mean Rewards')
ax0.set(ylabel='Rewards', xlabel='Episode', title='Training Rewards')
ax0.legend()

# Policy loss per iteration (top right).
ax1 = fig.add_subplot(gs[0, 2:])
ax1.plot(pol_loss)
ax1.set(ylabel='Loss', xlabel='Iteration', title='Policy Loss')

# Value-function loss per iteration (bottom right).
ax2 = fig.add_subplot(gs[1, 2:])
ax2.plot(vf_loss)
ax2.set(ylabel='Loss', xlabel='Iteration', title='Value Function Loss')

# Save before show(), so the figure is written while it is still current.
plt.savefig("Results_PPO_2.png", dpi=300)
plt.show()

Figure(1440x720)


In [None]:
# Display the full list of result objects collected from agent.train(), one
# per training iteration; the rendered output is very large.
results

In [None]:
# List the top-level keys of the first iteration's result.
results[0].keys()