In [2]:
import tensorflow as tf
from tf_agents.agents import ddpg
from tf_agents.agents.td3 import td3_agent
from tf_agents.drivers import dynamic_step_driver
from tf_agents.environments import tf_py_environment
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.utils import common

import sys
sys.path.insert(0, '..')
import Dataloader
import Environment

In [3]:
# ---------------------------------------------------------------------------
# Hyperparameters (everything a reader might want to tune lives in this cell)
# ---------------------------------------------------------------------------

# Training length: the training loop runs until the global train step
# reaches this bound.
num_iterations = 10_000

# Experience collection
initial_collect_steps = 1_000         # env steps gathered once, before training
collect_steps_per_iteration = 1_000   # env steps gathered on every loop iteration
replay_buffer_capacity = 1_000_000    # max_length handed to the replay buffer
# Ornstein-Uhlenbeck noise settings. NOTE(review): not referenced by any cell
# visible in this notebook -- presumably left over from a DDPG setup; confirm.
ou_stddev = 0.2
ou_damping = 0.15

# Target network updates
target_update_tau = 0.05
target_update_period = 5

# Agent / optimisation
exploration_noise_std = 0.1
actor_update_period = 2
train_steps_per_iteration = 1
batch_size = 1_000                    # sample_batch_size used when reading the replay buffer
actor_learning_rate = 1e-4
critic_learning_rate = 1e-3
dqda_clipping = None                  # NOTE(review): declared but not passed to the agent below
td_errors_loss_fn = tf.compat.v1.losses.huber_loss
gamma = 0.99
reward_scale_factor = 1.0
gradient_clipping = None

# Evaluation and checkpointing
num_eval_episodes = 1
eval_interval = 50                    # evaluate + checkpoint every this many train steps

In [5]:
# Load data
# Training split: load series from load1011.csv combined with the price series.
# The trailing positional argument `1` is forwarded unchanged to
# get_customer_data (NOTE(review): presumably a customer id/count -- confirm
# against Dataloader).
train_load = Dataloader.loadData('../../data/load1011.csv')
train_price = Dataloader.loadPrice('../../data/price.csv')
data_train = Dataloader.get_customer_data(train_load, train_price, 1)

# Evaluation split: same price file, load series from load1112.csv.
eval_load = Dataloader.loadData('../../data/load1112.csv')
eval_price = Dataloader.loadPrice('../../data/price.csv')
data_eval = Dataloader.get_customer_data(eval_load, eval_price, 1)

In [6]:
# Prepare runner
# Global step doubles as the agent's train-step counter and the checkpoint step.
global_step = tf.compat.v1.train.get_or_create_global_step()

# Environments: one on the training data for collection, one on the
# evaluation data for periodic policy evaluation.
py_env_train = Environment.Environment(init_charge=10.0, data=data_train)
tf_env = tf_py_environment.TFPyEnvironment(py_env_train)
py_env_eval = Environment.Environment(init_charge=10.0, data=data_eval)
tf_env_eval = tf_py_environment.TFPyEnvironment(py_env_eval)

observation_spec = tf_env.observation_spec()
action_spec = tf_env.action_spec()

# Actor and critic share the same hidden-layer sizes and activation.
hidden_layer_sizes = (400, 300)
relu = tf.keras.activations.relu

actor_net = ddpg.actor_network.ActorNetwork(
    input_tensor_spec=observation_spec,
    output_tensor_spec=action_spec,
    fc_layer_params=hidden_layer_sizes,
    activation_fn=relu,
)

critic_net = ddpg.critic_network.CriticNetwork(
    input_tensor_spec=(observation_spec, action_spec),
    joint_fc_layer_params=hidden_layer_sizes,
    activation_fn=relu,
)

actor_optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=actor_learning_rate)
critic_optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=critic_learning_rate)

# TD3 agent wired up with the hyperparameters from the configuration cell.
tf_agent = td3_agent.Td3Agent(
    tf_env.time_step_spec(),
    action_spec,
    actor_network=actor_net,
    critic_network=critic_net,
    actor_optimizer=actor_optimizer,
    critic_optimizer=critic_optimizer,
    exploration_noise_std=exploration_noise_std,
    target_update_tau=target_update_tau,
    target_update_period=target_update_period,
    actor_update_period=actor_update_period,
    td_errors_loss_fn=td_errors_loss_fn,
    gamma=gamma,
    reward_scale_factor=reward_scale_factor,
    gradient_clipping=gradient_clipping,
    debug_summaries=False,
    summarize_grads_and_vars=False,
    train_step_counter=global_step,
)
tf_agent.initialize()

# `policy` is used for evaluation, `collect_policy` for gathering experience.
eval_policy = tf_agent.policy
collect_policy = tf_agent.collect_policy

replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    tf_agent.collect_data_spec,
    batch_size=tf_env.batch_size,
    max_length=replay_buffer_capacity,
)


def make_collect_driver(num_steps):
    """Return a step driver that runs `collect_policy` on `tf_env` for
    `num_steps` steps and appends every collected batch to `replay_buffer`."""
    return dynamic_step_driver.DynamicStepDriver(
        tf_env,
        collect_policy,
        observers=[replay_buffer.add_batch],
        num_steps=num_steps,
    )


# One driver seeds the buffer before training, the other collects per iteration.
initial_collect_driver = make_collect_driver(initial_collect_steps)
collect_driver = make_collect_driver(collect_steps_per_iteration)

# Checkpoint covers agent, policy, replay buffer and the global step;
# only the most recent checkpoint is kept (max_to_keep=1).
train_checkpointer = common.Checkpointer(
    ckpt_dir='./checkpoints/td3/',
    max_to_keep=1,
    agent=tf_agent,
    policy=tf_agent.policy,
    replay_buffer=replay_buffer,
    global_step=global_step,
)

eval_summary_writer = tf.compat.v2.summary.create_file_writer(
    logdir='./log/td3/', flush_millis=10000
)

eval_metrics = [tf_metrics.AverageReturnMetric(buffer_size=num_eval_episodes)]

# Resume from an existing checkpoint if there is one, then re-read the
# global step handle.
train_checkpointer.initialize_or_restore()
global_step = tf.compat.v1.train.get_global_step()




In [7]:
# For better performance: wrap the driver `run` methods and the agent's
# `train` method with common.function.
for _driver in (initial_collect_driver, collect_driver):
    _driver.run = common.function(_driver.run)
tf_agent.train = common.function(tf_agent.train)

# Collect initial replay data
initial_collect_driver.run()

# Starting point for the per-iteration collection in the training loop.
time_step = tf_env.reset()
policy_state = collect_policy.get_initial_state(tf_env.batch_size)

# Dataset generates trajectories with shape [Bx2x...]
# (B = batch_size samples, each spanning num_steps=2 consecutive steps);
# this pipeline feeds data to the agent.
dataset = replay_buffer.as_dataset(
    num_parallel_calls=3,
    sample_batch_size=batch_size,
    num_steps=2,
)
dataset = dataset.prefetch(3)
iterator = iter(dataset)

  updates the state based on the action taken, and returns the next TimeStep object,
  which encapsulates the new state, reward, and whether the episode has ended.
  :return: next TimeStep


Instructions for updating:
Use `tf.data.Dataset.counter(...)` instead.


Instructions for updating:
Use `tf.data.Dataset.counter(...)` instead.


Instructions for updating:
Use `as_dataset(..., single_deterministic_pass=False) instead.


Instructions for updating:
Use `as_dataset(..., single_deterministic_pass=False) instead.


In [8]:
def evaluate_policy():
    """Run `num_eval_episodes` episodes of `eval_policy` on `tf_env_eval`.

    Metric summaries are written through `eval_summary_writer` under the
    'Metrics' prefix, tagged with the current `global_step`.

    Returns:
        Whatever `metric_utils.eager_compute` returns for `eval_metrics`.
    """
    return metric_utils.eager_compute(
        eval_metrics,
        tf_env_eval,
        eval_policy,
        num_episodes=num_eval_episodes,
        train_step=global_step,
        summary_writer=eval_summary_writer,
        summary_prefix='Metrics')


with tf.compat.v2.summary.record_if(True):
    # Baseline evaluation before any (further) training.
    evaluate_policy()

    # Train and evaluate.
    # Strict `<` so that exactly `num_iterations` train steps are taken; with
    # `<=` one extra step was trained that was never checkpointed or evaluated.
    while global_step.numpy() < num_iterations:
        # Gather fresh experience, carrying the environment/policy state over
        # from the previous iteration.
        time_step, policy_state = collect_driver.run(
            time_step=time_step,
            policy_state=policy_state,
        )

        # Honour the configured number of gradient updates per collection
        # phase (1 by default, which matches the previous behaviour).
        for _train_step in range(train_steps_per_iteration):
            experience, _unused_info = next(iterator)
            train_loss = tf_agent.train(experience)

            step = global_step.numpy()
            print('step = {0}: Loss = {1}'.format(step, train_loss.loss))
            with eval_summary_writer.as_default():
                tf.summary.scalar(name='loss', data=train_loss.loss, step=global_step)

            # Periodically checkpoint and evaluate the greedy policy.
            if step % eval_interval == 0:
                train_checkpointer.save(global_step)
                evaluate_policy()

            # Do not overshoot the bound when several updates run per iteration.
            if step >= num_iterations:
                break

step = 1: Loss = 152.95082092285156
step = 2: Loss = 151.955810546875
step = 3: Loss = 150.98739624023438
step = 4: Loss = 151.45814514160156
step = 5: Loss = 151.27590942382812
step = 6: Loss = 148.87506103515625
step = 7: Loss = 149.77540588378906
step = 8: Loss = 147.53358459472656
step = 9: Loss = 147.81134033203125
step = 10: Loss = 144.5395965576172
step = 11: Loss = 142.4980010986328
step = 12: Loss = 139.12059020996094
step = 13: Loss = 136.4272003173828
step = 14: Loss = 133.225830078125
step = 15: Loss = 130.8792724609375
step = 16: Loss = 126.95706939697266
step = 17: Loss = 123.0464859008789
step = 18: Loss = 118.03276062011719
step = 19: Loss = 114.2195816040039
step = 20: Loss = 106.9664306640625
step = 21: Loss = 102.54523468017578
step = 22: Loss = 95.34961700439453
step = 23: Loss = 91.01310729980469
step = 24: Loss = 86.27420043945312
step = 25: Loss = 81.82112884521484
step = 26: Loss = 79.2520980834961
step = 27: Loss = 80.32582092285156
step = 28: Loss = 82.7070083



step = 201: Loss = 32.6592903137207
step = 202: Loss = 33.236106872558594
step = 203: Loss = 32.823402404785156
step = 204: Loss = 33.61589050292969
step = 205: Loss = 33.03923416137695
step = 206: Loss = 32.742042541503906
step = 207: Loss = 33.338600158691406
step = 208: Loss = 32.356475830078125
step = 209: Loss = 33.45854568481445
step = 210: Loss = 33.40104293823242
step = 211: Loss = 33.74993896484375
step = 212: Loss = 33.389312744140625
step = 213: Loss = 34.368812561035156
step = 214: Loss = 33.97489929199219
step = 215: Loss = 33.83698272705078
step = 216: Loss = 34.15704345703125
step = 217: Loss = 35.59551239013672
step = 218: Loss = 35.81010818481445
step = 219: Loss = 36.364017486572266
step = 220: Loss = 35.60051727294922
step = 221: Loss = 35.32924270629883
step = 222: Loss = 33.490386962890625
step = 223: Loss = 33.366668701171875
step = 224: Loss = 32.3648796081543
step = 225: Loss = 33.368080139160156
step = 226: Loss = 32.65901565551758
step = 227: Loss = 33.7081527



step = 251: Loss = 27.627836227416992
step = 252: Loss = 27.914352416992188
step = 253: Loss = 27.166934967041016
step = 254: Loss = 25.836753845214844
step = 255: Loss = 25.427640914916992
step = 256: Loss = 26.025718688964844
step = 257: Loss = 26.56068229675293
step = 258: Loss = 26.466760635375977
step = 259: Loss = 26.876468658447266
step = 260: Loss = 27.122901916503906
step = 261: Loss = 25.55857276916504
step = 262: Loss = 25.77020263671875
step = 263: Loss = 25.200904846191406
step = 264: Loss = 25.633378982543945
step = 265: Loss = 26.43639373779297
step = 266: Loss = 26.296428680419922
step = 267: Loss = 26.81468963623047
step = 268: Loss = 26.723182678222656
step = 269: Loss = 26.30539894104004
step = 270: Loss = 25.550765991210938
step = 271: Loss = 25.461185455322266
step = 272: Loss = 26.071453094482422
step = 273: Loss = 26.552839279174805
step = 274: Loss = 26.98684310913086
step = 275: Loss = 26.337112426757812
step = 276: Loss = 25.903121948242188
step = 277: Loss = 

KeyboardInterrupt: 

In [None]:
# Test
# Held-out data. Paths use the same '../../data/' directory as the training and
# evaluation data cell; the previous './data/' prefix pointed somewhere else
# and this cell had never been executed.
data_test = Dataloader.get_customer_data(Dataloader.loadData('../../data/load1213.csv'),
                                         Dataloader.loadPrice('../../data/price.csv'), 1)
# NOTE(review): `test=True` is forwarded to the project Environment; its exact
# effect is defined there (presumably enables test-time recording) -- confirm.
tf_env_test = tf_py_environment.TFPyEnvironment(Environment.Environment(init_charge=10.0, data=data_test, test=True))
time_step_test = tf_env_test.reset()

# Roll the trained (non-exploring) policy through one full episode.
while not time_step_test.is_last():
    action_step = tf_agent.policy.action(time_step_test)
    time_step_test = tf_env_test.step(action_step.action)
