# DDPG
Building a Deep Deterministic Policy Gradient (DDPG) algorithm for battery scheduling involves several key components. Here's a brief overview of the main components and a sample TensorFlow implementation.

Main Components of DDPG for Battery Scheduling:
State Space: This includes the State of Energy (SoE) of the battery, Load demand, Electricity Price, and Photovoltaic (PV) output. These factors collectively represent the current situation and are input to the neural network.

- **Action Space**: This is the (dis)charging current of the battery, which is the output of the policy network. The action will be a continuous value, determining how much to charge or discharge the battery.

- **Actor Network**: This neural network approximates the policy. It takes the state as input and outputs the best believed action to maximize future rewards.

- **Critic Network**: This network estimates the value function. It takes both the state and action as input and outputs a Q-value, representing the expected future rewards of taking that action in that state.

- **Replay Buffer**: This is a data structure used to store and recall experience tuples (state, action, reward, next state). It helps in breaking the correlation between consecutive training samples.

- **Target Networks**: These are copies of the actor and critic networks, used to stabilize training. They are slowly updated to track the learned networks.

- **Exploration Strategy**: Since DDPG is an off-policy algorithm, an exploration strategy like adding noise to the action is often used to explore the action space effectively.

In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import random
from collections import deque
import time




In [2]:
# Hyperparameters
state_dim = 4  # [SoE, Load, Price, PV]
action_dim = 1  # (dis)charging current
actor_lr = 0.001
critic_lr = 0.002
gamma = 0.99
tau = 0.005
buffer_size = 100000
minibatch_size = 64
num_episodes = 10
iterations = 17520

In [3]:
import sys
sys.path.insert(0, '..')
import utils.dataloader as DL

data_train = DL.get_customer_data(DL.loadData('../../data/load1011.csv'), DL.loadPrice('../../data/price.csv'), customer=1)
data_eval = DL.get_customer_data(DL.loadData('../../data/load1112.csv'), DL.loadPrice('../../data/price.csv'), customer=1)

def reshape_data(df):
    # Reshape the DataFrame from a day-wise format to a continuous series format
    reshaped_data = df.values.flatten()  # Flatten the day-wise data into a single array
    return pd.DataFrame(reshaped_data)  # Convert back to DataFrame for consistency

# Assuming data_train[0], data_train[1], and data_train[2] are your load, PV, and price data respectively
load_data_reshaped = reshape_data(data_train[0])
pv_data_reshaped = reshape_data(data_train[1])
price_data_reshaped = pd.DataFrame(data_train[2].values)

print("Load data values {}, PV values {}, Price values {}".format(len(load_data_reshaped), len(pv_data_reshaped), len(price_data_reshaped)))

TypeError: get_customer_data() missing 1 required positional argument: 'dfmix'

In [4]:
class BatteryEnvironment:
    def __init__(self, load_data, pv_data, price_data, max_battery, eta=0.95):
        self.load_data = load_data
        self.pv_data = pv_data
        self.price_data = price_data
        self.max_battery = max_battery
        self.eta = eta  # Efficiency
        self.current_step = 0

    def reset(self):
        self.current_step = 0
        self.current_battery = self.max_battery / 2  # Start with half charge
        return self._next_observation()

    
    def _next_observation(self):
        # Access the current state's load, PV, and price
        load = self.load_data[self.current_step][0]
        pv = self.pv_data[self.current_step][0]
        price = self.price_data[self.current_step][0]

        # Normalize the state
        state = np.array([
            self.current_battery / self.max_battery,
            load / np.max(self.load_data),
            price / np.max(self.price_data),
            pv / np.max(self.pv_data)
        ], dtype=np.float32)

        return state
    
    def step(self, action):
        # Implement the battery dynamics
        battery_change = action * self.max_battery
        self.current_battery += battery_change * self.eta
        self.current_battery = np.clip(self.current_battery, 0, self.max_battery)

        # Ensure current_battery is a scalar
        self.current_battery = self.current_battery.item() if isinstance(self.current_battery, np.ndarray) else self.current_battery
        
        # Calculate the reward
        reward = -abs(self.load_data[self.current_step] - battery_change) * self.price_data[self.current_step]

        # Ensure reward is a scalar (convert it to a single value)
        reward = reward.item() if isinstance(reward, np.ndarray) else reward

        self.current_step += 1
        done = self.current_step >= len(self.load_data)

        return self._next_observation(), reward, done, {}
    
# Replay Buffer
class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def size(self):
        return len(self.buffer)

# Actor Network
class Actor(tf.keras.Model):
    def __init__(self):
        super(Actor, self).__init__()
        self.fc1 = tf.keras.layers.Dense(256, activation='relu')
        self.fc2 = tf.keras.layers.Dense(256, activation='relu')
        self.fc3 = tf.keras.layers.Dense(action_dim, activation='tanh')  # Scaled to action space

    def call(self, state):
        x = self.fc1(state)
        x = self.fc2(x)
        return self.fc3(x)

# Critic Network
class Critic(tf.keras.Model):
    def __init__(self):
        super(Critic, self).__init__()
        self.fc1 = tf.keras.layers.Dense(256, activation='relu')
        self.fc2 = tf.keras.layers.Dense(256, activation='relu')
        self.fc3 = tf.keras.layers.Dense(1)

    def call(self, state, action):
        x = self.fc1(tf.concat([state, action], axis=1))
        x = self.fc2(x)
        return self.fc3(x)

In [5]:
# Initialize Networks
actor = Actor()
critic = Critic()
target_actor = Actor()
target_critic = Critic()
target_actor.set_weights(actor.get_weights())
target_critic.set_weights(critic.get_weights())

# Initialize the environment
max_battery_capacity = 100  # Example capacity, adjust as needed
env = BatteryEnvironment(load_data_reshaped.values, pv_data_reshaped.values, price_data_reshaped.values, max_battery_capacity)




In [6]:
# Optimizers
actor_optimizer = tf.keras.optimizers.Adam(actor_lr)
critic_optimizer = tf.keras.optimizers.Adam(critic_lr)

# Replay Buffer
replay_buffer = ReplayBuffer(buffer_size)

In [7]:
# Function to update the networks
def update_networks(tau):
    new_actor_weights = []
    actor_weights = actor.get_weights()
    target_actor_weights = target_actor.get_weights()
    for aw, taw in zip(actor_weights, target_actor_weights):
        new_actor_weights.append(aw * tau + taw * (1 - tau))
    target_actor.set_weights(new_actor_weights)

    new_critic_weights = []
    critic_weights = critic.get_weights()
    target_critic_weights = target_critic.get_weights()
    for cw, tcw in zip(critic_weights, target_critic_weights):
        new_critic_weights.append(cw * tau + tcw * (1 - tau))
    target_critic.set_weights(new_critic_weights)

In [8]:
def train():
    for episode in range(num_episodes):
        print("Episode: ", episode)  
        state = env.reset()
        episode_reward = 0

        start_time = time.time()  # To measure the time taken for each episode

        for t in range(iterations-1):  # 48 time steps per day * 365 days
            action = actor(tf.convert_to_tensor([state], dtype=tf.float32))
            noise = np.random.normal(0, 0.1, size=action_dim)
            action = np.clip(action.numpy() + noise, -1, 1)
            action = np.squeeze(action, axis=-1)
            next_state, reward, done, _ = env.step(action[0])
            replay_buffer.add(state, action, reward, next_state, done)

            state = next_state
            episode_reward += reward

            if replay_buffer.size() > minibatch_size:
                minibatch = replay_buffer.sample(minibatch_size)
                states, actions, rewards, next_states, dones = map(np.array, zip(*minibatch))  # Convert to arrays

                # Critic Update
                with tf.GradientTape() as tape:
                    target_actions = target_actor(next_states)
                    target_critic_value = target_critic(next_states, target_actions, training=True)
                    
                    y = rewards + gamma * target_critic_value * (1 - dones)
                    critic_value = critic(states, actions, training=True)
                    critic_loss = tf.math.reduce_mean(tf.math.square(y - critic_value))

                critic_grad = tape.gradient(critic_loss, critic.trainable_variables)
                critic_optimizer.apply_gradients(zip(critic_grad, critic.trainable_variables))

                # Actor Update
                with tf.GradientTape() as tape:
                    actions = actor(states)
                    critic_value = critic(states, actions, training=True)
                    # We want to maximize the critic value, hence minimize the negative of critic_value
                    actor_loss = -tf.math.reduce_mean(critic_value)

                actor_grad = tape.gradient(actor_loss, actor.trainable_variables)
                actor_optimizer.apply_gradients(zip(actor_grad, actor.trainable_variables))

                # Update target networks
                update_networks(tau)

            if done:
                break

        end_time = time.time()
        print(f"Episode {episode + 1}, Reward: {episode_reward}, Time taken: {end_time - start_time} seconds")

train()

Episode:  0
Episode 1, Reward: -13705.569631635079, Time taken: 528.8077805042267 seconds
Episode:  1
Episode 2, Reward: -15403.587732426215, Time taken: 522.5618104934692 seconds
Episode:  2
Episode 3, Reward: -18799.297346016396, Time taken: 510.07124161720276 seconds
Episode:  3
Episode 4, Reward: -16970.971115304466, Time taken: 504.63184452056885 seconds
Episode:  4
Episode 5, Reward: -25333.424582508494, Time taken: 506.2705578804016 seconds
Episode:  5
Episode 6, Reward: -26437.93516378536, Time taken: 511.16979479789734 seconds
Episode:  6


KeyboardInterrupt: 