To compare the performance of different agents,
- Agents that need to be trained are trained for 100 episodes of length 400
- They are then tested for 30 epochs of length 2000 (they keep learning during these epochs).

In [1]:
""" A saved test result (Since the training and the testing take time, 
this is a saved result to have a first glance at the result.)"""

import pandas as pd
import seaborn as sns
import matplotlib.pylab as plt

# Divisor applied to the saved scores.  2000 is the test-episode length given
# in the introduction, so the division presumably turns per-episode totals
# into per-step averages -- TODO confirm against how the CSV was written.
SAVED_EPISODE_LENGTH = 2000

dt = pd.read_csv('ddpg_smart_eoq_100_fold_run_performance.csv') / SAVED_EPISODE_LENGTH

for name in dt.columns:
    # Only strictly positive scores go into the density plot; non-positive
    # runs are dropped here but still counted by describe() below.
    positive_scores = [score for score in dt[name] if score > 0]
    # The 'benchmark' column is displayed under the label 'baseline'.
    curve_label = 'baseline' if name == 'benchmark' else name
    sns.distplot(positive_scores, label=curve_label,
                 hist=False, rug=True, axlabel='score')
plt.title('Distribution of Score for Different Approaches')
# Draw the legend explicitly so the labelled curves can always be told apart,
# independent of whether the plotting call adds one by itself.
plt.legend()
dt.describe()

Unnamed: 0,benchmark,ddpg,smart
count,100.0,100.0,100.0
mean,8365.317165,8398.64215,8362.187664
std,66.544301,75.596317,868.468901
min,8149.2895,8223.812,-169.2386
25%,8328.686375,8344.48675,8406.024625
50%,8364.3935,8399.05075,8442.735
75%,8411.912375,8452.493125,8501.695875
max,8551.2625,8552.649,8646.927


According to the figure above, the DDPG method does indeed perform marginally better than the baseline,
but the improvement is very small and unconvincing.

In [1]:
## Train a new agent or restore an agent 

# Each random number source (Python's random, NumPy, TensorFlow) is seeded
# with the same fixed value, 0, directly after it is imported.
import random
random.seed(0)
import numpy as np
np.random.seed(0)
import tensorflow as tf
# Reset the default TensorFlow graph before setting the TensorFlow seed.
tf.reset_default_graph()
tf.set_random_seed(0)

import pandas as pd
import seaborn as sns

# Project-local modules: the environment, the agent classes, and the module
# whose train() function is used by the cells below.
from SupplyChain.SequentialSupplyChain import SequentialSupplyChain
from Agents.RandomAgent import RandomAgent
from Agents.BenchmarkAgent import BenchmarkAgent
from Agents.SMARTAgent import SMARTAgent
from Agents.DDPGAgent import DDPGAgent

import main

# One environment instance; every agent below is constructed against it.
env = SequentialSupplyChain()
# Starts empty; the test cell stores one entry per agent name in it.
rewards = {}

print('Initialising agents...')
# Agents are built in the order ddpg, smart, baseline and keyed by name.
agents = dict(
    ddpg=DDPGAgent(env),
    smart=SMARTAgent(env),
    baseline=BenchmarkAgent(env),
)

## Optional: restore the latest checkpoint instead of training
## NOTE(review): `agent` is not defined in this notebook; presumably one of
## the entries of `agents` is meant -- confirm before uncommenting.
# checkpoint_path = 'tmp/model.ckpt'
# agent.restore( checkpoint_path )

## Training Block ##
# Only these agents are passed to main.train; the baseline agent is left out.
agents_to_train = ['ddpg', 'smart']

for ag in agents_to_train:
    print('Training {} agent...'.format(ag))
    main.train(agent=agents[ag], env=env)

Initialise agents...


100%|██████████| 100/100 [09:55<00:00,  5.95s/it]
100%|██████████| 100/100 [01:35<00:00,  1.05it/s]


In [None]:
## Test Block ##

# Number of test episodes handed to main.train.
NUM_TEST_EPISODES = 50
# Handed to main.train as len_episode: a callable that ignores its argument
# and always returns 2000.
LEN_TEST_EPISODE = lambda x: 2000
# Short aliases, kept so anything relying on `n` / `l` still works.
n, l = NUM_TEST_EPISODES, LEN_TEST_EPISODE

# main.train is reused for testing; whatever it returns is stored per agent.
for ag_name in agents:
    print('Testing {} agent...'.format(ag_name))
    rewards[ag_name] = main.train(
        agent=agents[ag_name],
        env=env,
        len_episode=LEN_TEST_EPISODE,
        num_episodes=NUM_TEST_EPISODES,
    )


  0%|          | 0/50 [00:00<?, ?it/s][A

Testing ddpg agent...



 46%|████▌     | 23/50 [11:39<13:41, 30.42s/it]

In [None]:
## Stats of Average Performance and Standard Deviation

def print_stat( r, l, name ):
    """Print the mean and standard deviation of ``r``, each divided by ``l``.

    Args:
        r: Values to summarise; passed to ``np.mean`` and ``np.std``.
        l: Divisor applied to both statistics (pass 1 to leave them unscaled).
        name: Text placed at the start of the printed line.

    Returns:
        None. The summary is written to standard output.
    """
    average = np.mean(r) / l
    deviation = np.std(r) / l
    summary = ' Avr:{}, Std:{}'.format(average, deviation)
    print(name + summary)

# Summarise every column of `dt`; the divisor is 1, so the printed numbers are
# the column mean and standard deviation exactly as stored in the frame.
for name in dt:
    column_values = dt[name]
    print_stat(column_values, 1, name)

In [4]:
# One density curve (with rug marks) per agent; only values strictly between
# 8000 and 9000 are plotted.
for name in rewards:
    in_range = [score for score in rewards[name] if 8000 < score < 9000]
    sns.distplot(in_range, label=name, hist=False, rug=True)
plt.legend()

In [3]:
## Run this cell to save the ddpg model

import os

# Checkpoint path prefix handed to the saver (a file prefix, not a directory,
# despite the name).
SAVE_DIR = 'tmp/model.ckpt'

# Create the parent directory up front instead of relying on it already
# existing, so a finished training run is not lost to a failed save.
checkpoint_parent = os.path.dirname(SAVE_DIR)
if checkpoint_parent:
    os.makedirs(checkpoint_parent, exist_ok=True)

saver = tf.train.Saver()
# Last expression of the cell: the value returned by save() is displayed.
saver.save(agents['ddpg'].sess, SAVE_DIR)

'tmp/model.ckpt'

In [2]:
# Take a snapshot of the values the ddpg agent exposes for inspecting the
# training process, then show which keys are available.

from copy import deepcopy

# Deep copy, so the snapshot is independent of the agent's own container.
recorded_values = agents['ddpg'].inspected_values
inspected = deepcopy(recorded_values)
inspected.keys()

dict_keys(['entropy', 'actor_loss', 'td_errors', 'predicted_action_values', 'critic_loss'])

In [5]:
## Choose an attribute to inspect
KEY = 'predicted_action_values'
# At most this many leading entries of the chosen series are plotted.
LENGTH = 10000

import seaborn as sns
import numpy as np

# Convert the recorded series to an array and keep only the leading window.
series = np.array(inspected[KEY])
window = series[:LENGTH]
# NOTE: sns.lineplot returns a matplotlib Axes, so `fig` holds an Axes here.
fig = sns.lineplot(data=window)
# NOTE(review): the x axis is the position within the recorded series; whether
# that corresponds to episodes (as labelled) is not visible here -- confirm.
fig.set(xlabel='episode', ylabel=KEY)