In [1]:
# Imports
import os
import logging
import pandas as pd

logging.getLogger("wandb").setLevel(logging.ERROR)
logging.getLogger('tensorflow').setLevel(logging.ERROR)

os.environ['WANDB_SILENT'] = 'true'
os.environ['WANDB_CONSOLE'] = 'off'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import tensorflow as tf
from tf_agents.agents.ddpg import ddpg_agent
from tf_agents.drivers import dynamic_step_driver
from tf_agents.environments import tf_py_environment, py_environment, batched_py_environment
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.utils import common
import matplotlib.pyplot as plt
import wandb

import sys
sys.path.insert(0, '..')
from environments.EnergyManagementEnv import EnergyManagementEnv
from utils.agentNetworks import ActorNetwork, CriticNetwork, CustomLayers
#import utils.dataloader as DL








In [2]:
# Load data and setup environments
def get_energy_dataset(data_dir="../../data/1process_data"):
    """Load the processed CSV files and merge them into one DataFrame.

    Args:
        data_dir: Directory containing the processed CSV files. Defaults to
            the location the notebook has always read from.

    Returns:
        A DataFrame whose first two columns are ``price`` (taken from the
        ``Price`` column of the price file) and ``fuelmix`` (taken from the
        fuel-mix file), followed by every column of the total-load file and
        then every column of the PV file. In the load and PV columns the
        substring ``User`` is replaced by ``load_`` and ``pv_`` respectively.
    """
    def read_csv(file_name):
        # All files share the same layout: first row is the header.
        return pd.read_csv(os.path.join(data_dir, file_name), header=0)

    price_df = read_csv("processed_price.csv")
    fuel_df = read_csv("processed_fuelmix.csv")
    pv_df = read_csv("2010-2013 PV_processed.csv")
    totalload_df = read_csv("2010-2013 Totalload_processed.csv")
    # The gross-load file used to be read here as well, but its contents never
    # reached the returned frame, so the read has been dropped.

    # Rename per-user columns so load and PV columns can be told apart.
    pv_df = pv_df.rename(columns=lambda col: col.replace('User', 'pv_'))
    totalload_df = totalload_df.rename(columns=lambda col: col.replace('User', 'load_'))
    # The fuel-mix file stores its values in a column literally named "0".
    fuel_df = fuel_df.rename(columns={"0": 'Fuelmix'})

    # Price defines the row index; fuel mix is aligned to it, then load and PV
    # are appended column-wise.
    final_df = pd.DataFrame()
    final_df["price"] = price_df["Price"]
    final_df["fuelmix"] = fuel_df["Fuelmix"]
    final_df = pd.concat([final_df, totalload_df, pv_df], axis=1)

    return final_df

num_buildings = 30
# Use the 'Date' column as index and replace missing values with zeros.
energy_data = get_energy_dataset().set_index('Date').fillna(0)

# Row ranges (start, stop) of each split inside the full series.
split_bounds = {"train": (0, 17520), "eval": (17520, 35088), "test": (35088, 52608)}
# Extra environment arguments per split; only the test environment is built with logging=True.
env_kwargs = {"train": {}, "eval": {}, "test": {"logging": True}}

dataset = {split: {} for split in split_bounds}
environments = {split: {} for split in split_bounds}
for idx in range(num_buildings):
    building = f"building_{idx+1}"
    user_data = energy_data[[f'load_{idx+1}', f'pv_{idx+1}', 'price', 'fuelmix']]

    # Slice each split and re-index it from zero.
    for split, (start, stop) in split_bounds.items():
        dataset[split][building] = user_data[start:stop].set_index(pd.RangeIndex(0, stop - start))

    # Wrap one environment per split around the data sliced above.
    for split, extra in env_kwargs.items():
        environments[split][building] = tf_py_environment.TFPyEnvironment(
            EnergyManagementEnv(init_charge=0.0, data=dataset[split][building], **extra))

# Quick sanity output based on the first building.
first_train_env = environments["train"]["building_1"]
print("Batch size: ", first_train_env.batch_size)
print("State Space: {}, Action Space: {}".format(first_train_env.observation_spec().shape[0], first_train_env.action_spec().shape[0])) #SoE, price, price forecast 1-6
print("Upper bound: {}".format(round(first_train_env.action_spec().maximum.item(), 3)))
dataset["test"]["building_1"].head(1)

Batch size:  1
State Space: 6, Action Space: 1
Upper bound: 2.3


Unnamed: 0,load_1,pv_1,price,fuelmix
0,1.149,0.0,0.05704,0.530991


In [3]:
from tf_agents.agents import ddpg

def get_ddpg_agent(observation_spec, action_spec, custom_layers, global_step, time_step_spec=None):
    """Build and initialise a DDPG agent for the given specs.

    The networks are built from the ``observation_spec`` / ``action_spec``
    arguments, so the function no longer depends on the notebook globals
    ``environments`` and ``idx``.

    Args:
        observation_spec: Observation spec of the environment the agent acts in.
        action_spec: Action spec of the environment the agent acts in.
        custom_layers: Currently unused; kept so existing callers keep working.
            The networks are the stock TF-Agents DDPG actor/critic networks
            with two dense layers of 32 units each.
        global_step: Variable used as the agent's train step counter.
        time_step_spec: Optional time step spec of the environment. When
            omitted it is derived from ``observation_spec`` with TF-Agents'
            default reward and discount specs.
            NOTE(review): pass ``env.time_step_spec()`` explicitly if the
            environment declares a non-default reward spec.

    Returns:
        A tuple ``(tf_agent, eval_policy, collect_policy)``.
    """
    # Function-scope import: only needed to derive the default time step spec.
    from tf_agents.trajectories import time_step as ts

    if time_step_spec is None:
        time_step_spec = ts.time_step_spec(observation_spec)

    def build_actor():
        # Actor maps observations to actions.
        return ddpg.actor_network.ActorNetwork(
            input_tensor_spec=observation_spec,
            output_tensor_spec=action_spec, fc_layer_params=(32, 32),
            activation_fn=tf.keras.activations.relu)

    def build_critic():
        # Critic scores (observation, action) pairs.
        return ddpg.critic_network.CriticNetwork(
            input_tensor_spec=(observation_spec, action_spec),
            joint_fc_layer_params=(32, 32),
            activation_fn=tf.keras.activations.relu)

    # Online and target networks are separate instances with identical architecture.
    actor_net = build_actor()
    critic_net = build_critic()
    target_actor_network = build_actor()
    target_critic_network = build_critic()

    agent_params = {
        "time_step_spec": time_step_spec,
        "action_spec": action_spec,
        "actor_network": actor_net,
        "critic_network": critic_net,
        "actor_optimizer": tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3),
        "critic_optimizer": tf.compat.v1.train.AdamOptimizer(learning_rate=1e-2),
        "ou_stddev": 0.9,
        "ou_damping": 0.3,
        "target_actor_network": target_actor_network,
        "target_critic_network": target_critic_network,
        "target_update_tau": 0.05,
        "target_update_period": 5,
        "dqda_clipping": None,
        "td_errors_loss_fn": tf.compat.v1.losses.huber_loss,
        "gamma": 0.99,
        "reward_scale_factor": 1.0,
        "train_step_counter": global_step,
    }

    # Create the DdpgAgent with unpacked parameters
    tf_agent = ddpg_agent.DdpgAgent(**agent_params)

    tf_agent.initialize()
    eval_policy = tf_agent.policy
    collect_policy = tf_agent.collect_policy

    return tf_agent, eval_policy, collect_policy

In [4]:
def initialize_wandb_logging(project="DDPG_battery_testing", name="Exp", num_iterations=1500, batch_size=1, a_lr="1e-4", c_lr="1e-3"):
    """Log in to Weights & Biases, start a run and create a checkpoint artifact.

    Args:
        project: Name of the wandb project the run is recorded under.
        name: Display name of the run.
        num_iterations: Number of training steps, stored in the run config.
        batch_size: Training batch size, stored in the run config.
        a_lr: Actor learning rate (number or numeric string), stored in the
            run config as a float.
        c_lr: Critic learning rate (number or numeric string), stored in the
            run config as a float.

    Returns:
        A ``wandb.Artifact`` named ``save`` of type ``checkpoint``. Nothing is
        added to it here; attaching files is left to the caller.
    """
    wandb.login()
    wandb.init(
        project=project,
        job_type="train_eval_test",
        name=name,
        config={
            "train_steps": num_iterations,
            "batch_size": batch_size,
            # The defaults are strings, so normalise to float before logging.
            "actor_learning_rate": float(a_lr),
            "critic_learning_rate": float(c_lr)}
    )
    artifact = wandb.Artifact(name='save', type="checkpoint")

    return artifact

In [5]:
def setup_rl_training_pipeline(tf_agent, env_train, replay_buffer_capacity,collect_policy, initial_collect_steps, collect_steps_per_iteration, batch_size):
    """Create the replay buffer, collection drivers and training data iterator.

    The replay buffer is pre-filled by running ``collect_policy`` for
    ``initial_collect_steps`` steps, after which the environment is reset.
    ``tf_agent.train`` and both drivers' ``run`` methods are wrapped with
    ``common.function`` as a side effect.

    Returns:
        A tuple ``(iterator, collect_driver, time_step, policy_state)``:
        an iterator over sampled mini-batches of two-step experience, the
        driver used for collection during training, the time step returned by
        the environment reset, and the collect policy's initial state.
    """
    # Uniform replay buffer: every stored sample has the same chance of being drawn.
    buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=tf_agent.collect_data_spec,
        batch_size=env_train.batch_size,
        max_length=replay_buffer_capacity,
    )

    def make_driver(steps):
        # Both drivers differ only in how many environment steps one run() performs.
        return dynamic_step_driver.DynamicStepDriver(
            env=env_train,
            policy=collect_policy,
            observers=[buffer.add_batch],
            num_steps=steps,
        )

    # One driver for the warm-up phase, one for ongoing collection during training.
    warmup_driver = make_driver(initial_collect_steps)
    collect_driver = make_driver(collect_steps_per_iteration)

    # Wrap the hot paths for better performance.
    warmup_driver.run = common.function(warmup_driver.run)
    collect_driver.run = common.function(collect_driver.run)
    tf_agent.train = common.function(tf_agent.train)

    # Fill the buffer with initial experience, then start training from a fresh episode.
    warmup_driver.run()
    time_step = env_train.reset()
    policy_state = collect_policy.get_initial_state(env_train.batch_size)

    # Sample mini-batches of two consecutive steps from the buffer.
    autotune = tf.data.experimental.AUTOTUNE
    experience_ds = buffer.as_dataset(
        num_parallel_calls=autotune,
        sample_batch_size=batch_size,
        num_steps=2,
    ).prefetch(autotune)

    return iter(experience_ds), collect_driver, time_step, policy_state

In [6]:
# Set a fixed random seed for reproducibility
SEED = 42
tf.random.set_seed(SEED)

# Training hyperparameters
batch_size = 128
replay_buffer_capacity = 20000 #Before: 1000000 -> But only <18.000 samples per dataset
initial_collect_steps = 2000
collect_steps_per_iteration = 20
num_iterations = 10000
eval_interval = 6000

# Learning rates reported to wandb. They mirror the optimizers created inside
# get_ddpg_agent and must be kept in sync with them.
ACTOR_LEARNING_RATE = 1e-3
CRITIC_LEARNING_RATE = 1e-2

num_rounds = 1
# NOTE: the outer loop variable must stay named `idx`; it is also read as a
# notebook global elsewhere.
for idx in range(num_buildings):
    building_key = f"building_{idx+1}"
    train_env = environments["train"][building_key]

    # `round_idx` instead of `round`: the latter would shadow the builtin in the
    # notebook namespace and break earlier cells that call round() on re-run.
    for round_idx in range(num_rounds):

        # Fresh step counter per building and round, so every run starts at 0
        # regardless of any global-step state left over from a previous run.
        global_step = tf.Variable(0, dtype=tf.int64, trainable=False, name="global_step")

        #Setup agent
        tf_agent, eval_policy, collect_policy = get_ddpg_agent(
            observation_spec = train_env.observation_spec(),
            action_spec = train_env.action_spec(),
            custom_layers = [CustomLayers.get_dense_layers(layers=2, units=32)],
            global_step = global_step
            )

        #Setup iterator, replay buffer, driver
        iterator, collect_driver, time_step, policy_state = setup_rl_training_pipeline(
            tf_agent, train_env, replay_buffer_capacity,collect_policy, initial_collect_steps, collect_steps_per_iteration, batch_size
            )

        #Setup wandb logging with the hyperparameters actually used in this run
        # NOTE(review): the run name says "Dense2x128" while the networks are built
        # with 32 units per layer - confirm the intended label.
        artifact = initialize_wandb_logging(
            name=f"Exp_Dense2x128_building{idx+1}_rd{round_idx+1}",
            num_iterations=num_iterations,
            batch_size=batch_size,
            a_lr=ACTOR_LEARNING_RATE,
            c_lr=CRITIC_LEARNING_RATE,
        )

        # Train and evaluate
        eval_metrics = [tf_metrics.AverageReturnMetric()]
        test_metrics = [tf_metrics.AverageReturnMetric()]

        print(f"Start training building {idx+1} - Round {round_idx+1}")
        while global_step.numpy() < num_iterations:

            step_before_train = global_step.numpy()
            if step_before_train % 50 == 0:
                print(step_before_train, "/ ", num_iterations)

            time_step, policy_state = collect_driver.run(time_step=time_step, policy_state=policy_state)
            experience, _ = next(iterator)
            train_loss = tf_agent.train(experience)

            # The agent's train step counter has advanced; re-read it once.
            step = global_step.numpy()

            metrics = {}
            if step % eval_interval == 0:
                #train_checkpointer.save(global_step)
                metrics = metric_utils.eager_compute(eval_metrics,environments["eval"][building_key],
                    eval_policy,num_episodes=1,train_step=global_step,summary_writer=None,summary_prefix='',use_function=True)

            # Log every second step, and always when evaluation results exist so
            # they are not dropped for an odd eval_interval.
            if metrics or step % 2 == 0:
                metrics["loss"] = train_loss.loss
                wandb.log(metrics)

        print("Start testing ...")
        metrics = metric_utils.eager_compute(test_metrics,environments["test"][building_key],eval_policy,num_episodes=1)
        wandb.log(metrics)
        #artifact.add_dir(local_path='checkpoints/ddpg/')
        wandb.log_artifact(artifact)
        wandb.finish()
        tf.compat.v1.reset_default_graph()

Instructions for updating:
Use `tf.data.Dataset.counter(...)` instead.
Instructions for updating:
Use `as_dataset(..., single_deterministic_pass=False) instead.
Start training building 1 - Round 1
0 /  10000
50 /  10000
100 /  10000
150 /  10000
200 /  10000
250 /  10000
300 /  10000
350 /  10000
400 /  10000
450 /  10000
500 /  10000
550 /  10000
600 /  10000
650 /  10000
700 /  10000
750 /  10000
800 /  10000
850 /  10000
900 /  10000
950 /  10000
1000 /  10000
1050 /  10000
1100 /  10000
1150 /  10000
1200 /  10000
1250 /  10000
1300 /  10000
1350 /  10000
1400 /  10000
1450 /  10000
1500 /  10000
1550 /  10000
1600 /  10000
1650 /  10000
1700 /  10000
1750 /  10000
1800 /  10000
1850 /  10000
1900 /  10000
1950 /  10000
2000 /  10000
2050 /  10000
2100 /  10000
2150 /  10000
2200 /  10000
2250 /  10000
2300 /  10000
2350 /  10000
2400 /  10000
2450 /  10000
2500 /  10000
2550 /  10000
2600 /  10000
2650 /  10000
2700 /  10000
2750 /  10000
2800 /  10000
2850 /  10000
2900 /  10000
