# Deep Deterministic Policy Gradient 

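This notebook trains, evaluates and tests a Deep Deterministic Policy Gradient (DDPG) agent with TF-Agents on a custom environment built from per-customer load and price data.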

In [1]:
#Imports
import tensorflow as tf
from tf_agents.agents import ddpg
from tf_agents.agents.ddpg import ddpg_agent
from tf_agents.drivers import dynamic_step_driver
from tf_agents.environments import tf_py_environment
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.utils import common

import sys
sys.path.insert(0, '..')
import utils.Dataloader as DL
from utils.modelgenerator import *
import utils.actorNetworkCustom as actornet
import utils.criticNetworkCustom as criticnet
import Environment

import wandb
from wandb.keras import WandbCallback
# Start a Weights & Biases run in project "DDPG" with run name "001".
wandb.init(project="DDPG",name="001")
# Keras-style W&B callback object, created with default arguments
# (no Keras model is attached to it here).
wandb_callback = WandbCallback()








Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjonas-sievers[0m ([33mipe[0m). Use [1m`wandb login --relogin`[0m to force relogin




### Hyperparameters

In [2]:
# Training budget: the training loop compares the global train-step counter
# against this value to decide when to stop.
num_iterations = 1500

# Customer selector: passed to DL.get_customer_data and embedded in the
# checkpoint and log directory names.
customer = 1

# Params for collect
# Number of environment steps gathered once, before training starts, to
# pre-fill the replay buffer so that the first sampled batches are diverse.
# Note: this initial collection is driven by the agent's collect_policy,
# not by a uniformly random policy.
initial_collect_steps = 1000

# Number of environment steps the collect driver gathers in every iteration of
# the training loop, i.e. between two consecutive agent updates. Larger values
# add more fresh experience per update at the cost of slower iterations.
collect_steps_per_iteration = 2000

# Passed as max_length to the uniform replay buffer. A large buffer keeps old
# experience around longer, which decorrelates the sampled batches.
replay_buffer_capacity = 1000000

# Standard deviation of the Ornstein-Uhlenbeck (OU) noise the agent uses for
# exploration in the action space.
ou_stddev = 0.2

# Damping term of the OU process; larger values pull the noise back toward
# zero faster, i.e. produce less persistent exploration noise.
ou_damping = 0.15

# Params for target update
# Soft-update coefficient for the target networks: how strongly the target
# weights are blended toward the online weights on each target update.
# Small values make the targets change slowly, which stabilises training.
target_update_tau = 0.05

# Number of training steps between two target-network updates.
target_update_period = 5

# Params for train
# Intended number of gradient steps per training-loop iteration.
# NOTE(review): no cell visible in this notebook reads this value; the
# training loop performs a single tf_agent.train call per iteration.
train_steps_per_iteration = 1

# Number of trajectory slices sampled from the replay buffer for each training
# step (used as sample_batch_size of the replay dataset).
# 48 * 7 is presumably 48 half-hour slots x 7 days - TODO confirm.
batch_size = 48 * 7

# Learning rate of the Adam optimizer that updates the actor (policy) network.
actor_learning_rate = 1e-4

# Learning rate of the Adam optimizer that updates the critic (Q-value)
# network.
critic_learning_rate = 1e-3

# Optional bound for clipping the gradient of the Q-value with respect to the
# actions (dQ/da) in the actor loss; None is forwarded unchanged to DdpgAgent.
dqda_clipping = None

# Loss applied to the critic's temporal-difference (TD) errors, i.e. the gap
# between the predicted Q-values and their TD targets. Huber loss is quadratic
# for small errors and linear for large ones, so it is less sensitive to
# outliers than mean squared error.
td_errors_loss_fn = tf.compat.v1.losses.huber_loss

# Discount factor applied to future rewards in the Q-value estimation.
# Values close to 1 make the agent weigh long-term rewards almost as much as
# immediate ones; smaller values make it more short-sighted.
gamma = 0.99

# Multiplier applied to rewards during training. Scaling keeps very large or
# very small reward magnitudes from dominating the learning signal.
reward_scale_factor = 1.0

# Optional parameter for clipping gradients during training; None is forwarded
# unchanged to DdpgAgent.
gradient_clipping = None

# Params for eval and checkpoints
# Number of episodes per evaluation run; also used as the buffer size of the
# average-return metric.
num_eval_episodes = 1

# Evaluation and checkpoint frequency: whenever the global step is a multiple
# of this value, the training loop saves a checkpoint and evaluates the policy.
eval_interval = 50


### Data Handling

In [3]:
# Load data
def _load_customer_dataset(load_csv_path):
    """Build the dataset of `customer` from one load CSV and the price CSV.

    Args:
        load_csv_path: Path of the load CSV handed to `DL.loadData`.

    Returns:
        Whatever `DL.get_customer_data` returns for the loaded load data,
        the loaded price data and the configured `customer`.
    """
    load_data = DL.loadData(load_csv_path)
    price_data = DL.loadPrice('../../data/price.csv')
    return DL.get_customer_data(load_data, price_data, customer)


# Training and evaluation read different load files but the same price file.
data_train = _load_customer_dataset('../../data/load1011.csv')
data_eval = _load_customer_dataset('../../data/load1112.csv')

In [4]:
# Peek at the first three rows of the element at index 1 of the training data.
# NOTE(review): which quantity index 1 holds is decided by
# DL.get_customer_data and is not visible here - TODO confirm.
data_train[1][:3]

Unnamed: 0,0:30,1:00,1:30,2:00,2:30,3:00,3:30,4:00,4:30,5:00,...,19:30,20:00,20:30,21:00,21:30,22:00,22:30,23:00,23:30,0:00
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Create custom models

In [5]:
# Build the template layer stack. ModelGenerator offers several architectures:
# get_dense_layers(), get_bilstm_model(), get_lstm_model(), get_cnn_lstm_model(), get_cnn_model()
m1 = ModelGenerator()
custom_layers = m1.get_cnn_model()


def copy_layer(layer):
    """Return a new layer with the same class and configuration as `layer`.

    The layer is rebuilt from its class name and its `get_config()` output,
    so the result is a separate layer object rather than a reference to the
    template layer.

    Args:
        layer: Keras layer to use as the template.

    Returns:
        The layer produced by `tf.keras.layers.deserialize`.
    """
    layer_spec = {
        'class_name': layer.__class__.__name__,
        'config': layer.get_config(),
    }
    return tf.keras.layers.deserialize(layer_spec)


def _fresh_layer_stack():
    """Return one independent copy of every layer in `custom_layers`."""
    return [copy_layer(template) for template in custom_layers]


# Each of the four networks gets its own copy of the template stack.
actor_layers = _fresh_layer_stack()
target_actor_layers = _fresh_layer_stack()
critic_layers = _fresh_layer_stack()
target_critic_layers = _fresh_layer_stack()

In [6]:
# Prepare runner
# The global step counts training steps; it is handed to the agent below as
# its train_step_counter.
global_step = tf.compat.v1.train.get_or_create_global_step()

# Wrap the custom Python environment as TensorFlow environments, one per data
# split. Both start with init_charge=0.0.
tf_env_train = tf_py_environment.TFPyEnvironment(
    Environment.Environment(init_charge=0.0, data=data_train)
)
tf_env_eval = tf_py_environment.TFPyEnvironment(
    Environment.Environment(init_charge=0.0, data=data_eval)
)

# All four networks are built against the specs of the training environment.
network_specs = {
    'observation_spec': tf_env_train.observation_spec(),
    'action_spec': tf_env_train.action_spec(),
}

# Actor networks map observations to actions; each gets its own layer copies.
actor_net = actornet.ActorNetworkCustom(
    custom_layers=actor_layers,
    use_ensemble=False,
    **network_specs,
)
target_actor_net = actornet.ActorNetworkCustom(
    custom_layers=target_actor_layers,
    use_ensemble=False,
    **network_specs,
)

# Critic networks, again one online network and one target network.
critic_net = criticnet.CriticNetworkCustom(
    custom_layers=critic_layers,
    name='CriticNetworkCustom',
    **network_specs,
)
target_critic_net = criticnet.CriticNetworkCustom(
    custom_layers=target_critic_layers,
    name='TargetCriticNetworkCustom',
    **network_specs,
)

# Hyperparameters forwarded unchanged from the hyperparameter cell.
agent_hyperparameters = dict(
    ou_stddev=ou_stddev,                        # std-dev of the Ornstein-Uhlenbeck noise
    ou_damping=ou_damping,                      # damping of the Ornstein-Uhlenbeck noise
    target_update_tau=target_update_tau,        # soft-update coefficient for the targets
    target_update_period=target_update_period,  # train steps between target updates
    dqda_clipping=dqda_clipping,                # optional clipping of dQ/da
    td_errors_loss_fn=td_errors_loss_fn,        # loss on the TD errors
    gamma=gamma,                                # discount factor
    reward_scale_factor=reward_scale_factor,    # reward multiplier
    gradient_clipping=gradient_clipping,        # optional gradient clipping
)

# Assemble the DDPG agent from the networks, one Adam optimizer per network
# and the hyperparameters above. Debug summaries are switched off.
tf_agent = ddpg_agent.DdpgAgent(
    tf_env_train.time_step_spec(),
    tf_env_train.action_spec(),
    actor_network=actor_net,
    target_actor_network=target_actor_net,
    critic_network=critic_net,
    target_critic_network=target_critic_net,
    actor_optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=actor_learning_rate),
    critic_optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=critic_learning_rate),
    debug_summaries=False,
    summarize_grads_and_vars=False,
    train_step_counter=global_step,
    **agent_hyperparameters,
)

tf_agent.initialize()

### Initialize DDPG

In [7]:
# The agent exposes two policies: `collect_policy` gathers experience,
# `policy` is the one that gets evaluated.
collect_policy = tf_agent.collect_policy
eval_policy = tf_agent.policy

# Replay buffer holding the collected experience.
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    tf_agent.collect_data_spec,
    batch_size=tf_env_train.batch_size,
    max_length=replay_buffer_capacity,
)


def _make_collect_driver(step_count):
    """Create a step driver that collects `step_count` steps per `run()` call.

    The driver steps the training environment with the collect policy and
    adds every collected batch to the replay buffer.
    """
    return dynamic_step_driver.DynamicStepDriver(
        tf_env_train,
        collect_policy,
        observers=[replay_buffer.add_batch],
        num_steps=step_count,
    )


# One driver for the initial buffer fill, one for the per-iteration collection.
initial_collect_driver = _make_collect_driver(initial_collect_steps)
collect_driver = _make_collect_driver(collect_steps_per_iteration)

# Checkpoints and summaries go to per-customer directories.
checkpoint_dir = 'checkpoints/ddpg' + str(customer) + '/'
summary_dir = './log/ddpg' + str(customer) + '/'

# The checkpoint covers the agent, its policy, the replay buffer and the
# global step; only the most recent checkpoint is kept.
train_checkpointer = common.Checkpointer(
    ckpt_dir=checkpoint_dir,
    max_to_keep=1,
    agent=tf_agent,
    policy=tf_agent.policy,
    replay_buffer=replay_buffer,
    global_step=global_step
)

eval_summary_writer = tf.compat.v2.summary.create_file_writer(
    logdir=summary_dir, flush_millis=10000
)

# Average return over the evaluation episodes.
eval_metrics = [tf_metrics.AverageReturnMetric(buffer_size=num_eval_episodes)]

# Initialize or restore the checkpointed state, then re-fetch the global step.
train_checkpointer.initialize_or_restore()
global_step = tf.compat.v1.train.get_global_step()

In [8]:
# For better performance
def _wrap_with_common_function(owner, method_name):
    """Replace `owner.<method_name>` by its `common.function`-wrapped version."""
    setattr(owner, method_name, common.function(getattr(owner, method_name)))


# The two collection entry points and the train step are the hot paths.
_wrap_with_common_function(initial_collect_driver, 'run')
_wrap_with_common_function(collect_driver, 'run')
_wrap_with_common_function(tf_agent, 'train')

In [9]:
# Collect initial replay data
# Runs the initial-collect driver once; its observer adds the collected steps
# to the replay buffer so there is experience to sample before training.
initial_collect_driver.run()

# Reset the training environment and fetch the collect policy's initial state.
# Both values are the starting point handed to collect_driver.run() in the
# training loop.
time_step = tf_env_train.reset()
policy_state = collect_policy.get_initial_state(tf_env_train.batch_size)

  load = self._load_data.iloc[self._current_day][self._current_timeslot]
  pv = self._pv_data.iloc[self._current_day][self._current_timeslot]
  self._electricity_prices.iloc[(self._current_day * self._max_timeslots) + self._current_timeslot][0]


### Run DDPG

In [15]:
# Dataset generates trajectories with shape [Bx2x...]
# Input pipeline feeding the agent: every element is a batch of `batch_size`
# two-step trajectory slices sampled from the replay buffer.
dataset = replay_buffer.as_dataset(
    num_parallel_calls=3, sample_batch_size=batch_size, num_steps=2
).prefetch(3)
iterator = iter(dataset)


def _evaluate_policy():
    """Run `num_eval_episodes` episodes of `eval_policy` on `tf_env_eval`.

    The metrics in `eval_metrics` are written to `eval_summary_writer` under
    the 'Metrics' prefix, tagged with the current global step.

    Returns:
        The result of `metric_utils.eager_compute`.
    """
    return metric_utils.eager_compute(
        eval_metrics,
        tf_env_eval,
        eval_policy,
        num_episodes=num_eval_episodes,
        train_step=global_step,
        summary_writer=eval_summary_writer,
        summary_prefix='Metrics',
    )


with tf.compat.v2.summary.record_if(True):

    # Baseline evaluation before the training loop starts.
    _evaluate_policy()

    # Train and evaluate.
    # Strict `<`: a fresh run performs exactly `num_iterations` train steps and
    # ends on a step that is evaluated and checkpointed (when it is a multiple
    # of `eval_interval`). With `<=` one extra, never-evaluated step was taken.
    while global_step.numpy() < num_iterations:

        # Gather fresh experience, continuing from the previous time step.
        time_step, policy_state = collect_driver.run(
            time_step=time_step,
            policy_state=policy_state,
        )

        # One agent update on a sampled batch; the second element of the
        # dataset item is not needed here.
        experience, _ = next(iterator)
        train_loss = tf_agent.train(experience)

        # Read step and loss once and reuse them for all logging below.
        step = int(global_step.numpy())
        loss = float(train_loss.loss.numpy())
        print('step = {0}: Loss = {1}'.format(step, loss))

        # Log directly through wandb.log: the Keras WandbCallback is meant to
        # be driven by a Keras training loop, which this notebook does not
        # have. The 'epoch' key carries the global step.
        wandb.log({'epoch': step, 'train_loss': loss})

        with eval_summary_writer.as_default():
            tf.summary.scalar(name='loss', data=train_loss.loss, step=global_step)

        # Periodically persist the training state and evaluate the policy.
        if step % eval_interval == 0:
            train_checkpointer.save(global_step)
            _evaluate_policy()

wandb.finish()

  load = self._load_data.iloc[self._current_day][self._current_timeslot]
  pv = self._pv_data.iloc[self._current_day][self._current_timeslot]
  self._electricity_prices.iloc[(self._current_day * self._max_timeslots) + self._current_timeslot][0]


AttributeError: No TensorFlow session-like object was provided, and none could be retrieved using 'tf.get_default_session()'.
  In call to configurable 'compute_summaries' (<function compute_summaries at 0x00000163EB434700>)

In [14]:
# Test
# Build the test environment from the test load file and roll the agent's
# policy through it until the environment reports the last time step.
data_test = DL.get_customer_data(
    DL.loadData('../../data/load1213.csv'),
    DL.loadPrice('../../data/price.csv'),
    customer,
)
test_py_env = Environment.Environment(init_charge=0.0, data=data_test, test=True)
tf_env_test = tf_py_environment.TFPyEnvironment(test_py_env)
time_step_test = tf_env_test.reset()

while True:
    if time_step_test.is_last():
        break
    # Query the policy for the current time step and apply its action.
    action_step = tf_agent.policy.action(time_step_test)
    time_step_test = tf_env_test.step(action_step.action)

  load = self._load_data.iloc[self._current_day][self._current_timeslot]
  pv = self._pv_data.iloc[self._current_day][self._current_timeslot]
  self._electricity_prices.iloc[(self._current_day * self._max_timeslots) + self._current_timeslot][0]
