In [7]:
#Imports
import tensorflow as tf
from tf_agents.agents import ddpg
from tf_agents.agents.ddpg import ddpg_agent
from tf_agents.drivers import dynamic_step_driver
from tf_agents.environments import tf_py_environment
from tf_agents.metrics import tf_metrics
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.utils import common
from tf_agents.policies import policy_saver
from tf_agents.policies import py_tf_eager_policy

import os
import sys
sys.path.insert(0, '..')
import utils.Dataloader as DL
import Environment

In [8]:
# Hyperparameter

# Param for iteration
num_iterations = 100
customer = 1
# Params for collect
initial_collect_steps = 1000
collect_steps_per_iteration = 2000
replay_buffer_capacity = 1000000
ou_stddev = 0.2
ou_damping = 0.15

# Params for target update
target_update_tau = 0.05
target_update_period = 5

# Params for train
train_steps_per_iteration = 1
batch_size = 1
actor_learning_rate = 1e-4
critic_learning_rate = 1e-3
dqda_clipping = None
td_errors_loss_fn = tf.compat.v1.losses.huber_loss
gamma = 0.99
reward_scale_factor = 1.0
gradient_clipping = None

# Params for eval and checkpoints
num_eval_episodes = 1
eval_interval = 50

In [9]:
# Load data
data_train = DL.get_customer_data(DL.loadData('../../data/load1011.csv'), DL.loadPrice('../../data/price.csv'), customer)
data_eval = DL.get_customer_data(DL.loadData('../../data/load1112.csv'), DL.loadPrice('../../data/price.csv'), customer)

In [10]:
# Prepare Agent

# Get or create the global step variable, which is a counter for the number of training steps
global_step = tf.compat.v1.train.get_or_create_global_step()

# Create TensorFlow environments for training and evaluation using custom environment settings
tf_env_train = tf_py_environment.TFPyEnvironment(Environment.Environment(init_charge=0.0, data=data_train))
tf_env_eval = tf_py_environment.TFPyEnvironment(Environment.Environment(init_charge=0.0, data=data_eval))

## Define the actor network, responsible for generating actions based on observations
actor_net = ddpg.actor_network.ActorNetwork(
    input_tensor_spec=tf_env_train.observation_spec(),
    output_tensor_spec=tf_env_train.action_spec(), 
    fc_layer_params=(400, 300), # Define the architecture of the fully connected layers
    activation_fn=tf.keras.activations.relu
)

# Define the critic network, responsible for estimating the Q-values for state-action pairs
critic_net = ddpg.critic_network.CriticNetwork(
    input_tensor_spec=(tf_env_train.observation_spec(), tf_env_train.action_spec()),
    joint_fc_layer_params=(400, 300), # Define the architecture of the fully connected layers
    activation_fn=tf.keras.activations.relu
)

# Create a DDPG agent using the defined actor and critic networks, along with other parameters
tf_agent = ddpg_agent.DdpgAgent(
    tf_env_train.time_step_spec(),
    tf_env_train.action_spec(),
    actor_network=actor_net,
    critic_network=critic_net,
    actor_optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=actor_learning_rate),
    critic_optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=critic_learning_rate),
    ou_stddev=ou_stddev, # Standard deviation for Ornstein-Uhlenbeck noise
    ou_damping=ou_damping, # Damping term for Ornstein-Uhlenbeck noise
    target_update_tau=target_update_tau, # Soft update coefficient for target networks
    target_update_period=target_update_period, # Frequency of updating target networks
    dqda_clipping=dqda_clipping, # Optional clipping of the gradient of Q-value with respect to actions
    td_errors_loss_fn=td_errors_loss_fn, # Loss function for temporal difference errors
    gamma=gamma, # Discount factor for future rewards
    reward_scale_factor=reward_scale_factor, # Scaling factor for rewards during training
    gradient_clipping=gradient_clipping, # Optional clipping of gradients during training
    debug_summaries=False, # Disable debug summaries
    summarize_grads_and_vars=False,  # Disable summarizing gradients and variables
    train_step_counter=global_step,  # Use the global step as the train step counter
)

tf_agent.initialize()

In [11]:
eval_policy = tf_agent.policy
collect_policy = tf_agent.collect_policy

replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    tf_agent.collect_data_spec,
    batch_size=tf_env_train.batch_size,
    max_length=replay_buffer_capacity,
)

initial_collect_driver = dynamic_step_driver.DynamicStepDriver(
    tf_env_train,
    collect_policy,
    observers=[replay_buffer.add_batch],
    num_steps=initial_collect_steps,
)

collect_driver = dynamic_step_driver.DynamicStepDriver(
    tf_env_train,
    collect_policy,
    observers=[replay_buffer.add_batch],
    num_steps=collect_steps_per_iteration,
)

eval_metrics = [
    tf_metrics.AverageReturnMetric(buffer_size=num_eval_episodes)
]

global_step = tf.compat.v1.train.get_global_step()

In [12]:
import wandb
from wandb.keras import WandbCallback
wandb.login()

config={
      "batch_size": batch_size,
      "actor_learning_rate": actor_learning_rate,
      "critic_learning_rate": critic_learning_rate
      }
wandb.init(
    project="RL_Test",
    name="001",
    config=config
)

ModuleNotFoundError: No module named 'wandb.keras'

In [None]:
# For better performance
initial_collect_driver.run = common.function(initial_collect_driver.run)
collect_driver.run = common.function(collect_driver.run)
tf_agent.train = common.function(tf_agent.train)

In [None]:
# Collect initial replay data
initial_collect_driver.run()

time_step = tf_env_train.reset()
policy_state = collect_policy.get_initial_state(tf_env_train.batch_size)

In [None]:
def compute_avg_return(environment, policy, num_episodes=1):
    total_return = 0.0
    for _ in range(num_episodes):
        time_step = environment.reset()
        episode_return = 0.0
        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = environment.step(action_step.action)
            episode_return += time_step.reward[0]
        print('Episode Return = {0}'.format(episode_return))
        total_return += episode_return
    avg_return = total_return / num_episodes
    return avg_return

In [None]:
# policy_dir = os.path.join("/", 'policy')
# tf_policy_saver = policy_saver.PolicySaver(eval_policy,1, global_step)

In [None]:
# Dataset generates trajectories with shape [Bx2x...]
# pipeline which will feed data to the agent
dataset = replay_buffer.as_dataset(
    num_parallel_calls=3, sample_batch_size=batch_size, num_steps=2
).prefetch(3)
iterator = iter(dataset)

# Train and evaluate
while global_step.numpy() <= num_iterations:
    time_step, policy_state = collect_driver.run(
        time_step=time_step,
        policy_state=policy_state,
    )
    experience, _ = next(iterator)
    train_loss = tf_agent.train(experience)
    # actor_net.save("test")
    if (global_step.numpy() % 25 == 0):
        avg_return = compute_avg_return(tf_env_eval, eval_policy)
        print('step = {0}: loss = {1}, avg_return = {2}'.format(global_step.numpy(), train_loss.loss, avg_return))
        wandb.log({"loss": train_loss.loss, "avg_return": avg_return})
        # tf_policy_saver.save(policy_dir)
    else:
        print('step = {0}: loss = {1}'.format(global_step.numpy(), train_loss.loss))
        wandb.log({"loss": train_loss.loss})
wandb.finish()