In [28]:
import gym
from gym import spaces
import numpy as np

class BatteryManagementEnv(gym.Env):
    """
    A battery management environment for use with OpenAI Gym.
    Actions:
        0: Charge the battery
        1: Discharge the battery
        2: Do nothing (hold)
    State:
        A single number representing the normalized charge level of the battery (0 to 1)
    """
    def __init__(self):
        super(BatteryManagementEnv, self).__init__()
        self.action_space = spaces.Discrete(3)  # [Charge, Discharge, Hold]
        self.observation_space = spaces.Box(low=np.array([0]), high=np.array([1]), dtype=np.float32)
        
        self.max_charge = 100.0  # Maximum charge of the battery
        self.charge_level = self.max_charge / 2  # Start with the battery half charged
        
        # Define reward parameters here as needed

    def step(self, action):
        # Implement the effect of the action
        if action == 0:  # Charge
            self.charge_level = min(self.charge_level + 10, self.max_charge)
        elif action == 1:  # Discharge
            self.charge_level = max(self.charge_level - 10, 0)
        
        # Normalize the charge level to [0, 1] for the observation
        normalized_charge = self.charge_level / self.max_charge
        self.state = np.array([normalized_charge])
        
        # Calculate reward (this is a placeholder, adjust based on your needs)
        reward = 1.0 - abs(0.5 - normalized_charge)  # Example: higher reward for being around 50% charge
        
        # Determine if the episode is done (e.g., a day has passed)
        done = False  # Implement your own condition for ending an episode
        
        # Optionally include additional info
        info = {}
        
        return self.state, reward, done, info

    def reset(self):
        # Reset the battery level to a random state
        self.charge_level = np.random.uniform(0, self.max_charge)
        normalized_charge = self.charge_level / self.max_charge
        self.state = np.array([normalized_charge])
        return self.state

    def render(self, mode='console'):
        if mode != 'console':
            raise NotImplementedError("Only console mode is supported.")
        print(f"Charge Level: {self.charge_level}/{self.max_charge}")

    def close(self):
        pass


In [29]:
import tensorflow as tf
from tensorflow.keras import layers

def create_q_model(num_states, num_actions):
    """Builds a simple Deep Q-Network."""
    inputs = layers.Input(shape=(num_states,))
    layer1 = layers.Dense(32, activation="relu")(inputs)
    #layer2 = layers.Dense(128, activation="relu")(layer1)
    action = layers.Dense(num_actions, activation="linear")(layer1)
    
    return tf.keras.Model(inputs=inputs, outputs=action)


In [31]:
import numpy as np
import random
from collections import deque
from tensorflow.keras.optimizers import Adam

env = BatteryManagementEnv()  # Assuming this is already defined
num_states = env.observation_space.shape[0]
print(num_states)
num_actions = env.action_space.n
print(num_actions)

model = create_q_model(num_states, num_actions)
model_target = create_q_model(num_states, num_actions)  # For target network
model.compile(optimizer=Adam(learning_rate=1e-3), loss="mse")

# Parameters
gamma = 0.95  # Discount factor
epsilon = 1.0  # Exploration rate
epsilon_min = 0.01
epsilon_decay = 0.995
batch_size = 16
memory = deque(maxlen=100)  # Experience replay memory

# Training loop
for episode in range(10):
    state = np.reshape(env.reset(), [1, num_states])
    done = False
    total_reward = 0
    
    while not done:
        # Epsilon-greedy action selection
        if np.random.rand() <= epsilon:
            action = env.action_space.sample()
        else:
            q_values = model.predict(state)
            action = np.argmax(q_values[0])
        
        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, [1, num_states])
        
        # Store experience in replay memory
        memory.append((state, action, reward, next_state, done))
        
        state = next_state
        total_reward += reward
        
        # Experience replay
        if len(memory) > batch_size:
            minibatch = random.sample(memory, batch_size)
            for state, action, reward, next_state, done in minibatch:
                target = reward
                if not done:
                    target = reward + gamma * np.amax(model_target.predict(next_state)[0])
                target_f = model.predict(state)
                target_f[0][action] = target
                
                model.fit(state, target_f, epochs=1, verbose=0)

        done = True
        
    # Update epsilon
    epsilon = max(epsilon_min, epsilon_decay * epsilon)
    
    # Update the target network with weights from the model
    model_target.set_weights(model.get_weights())

    print(f"Episode: {episode + 1}, Total Reward: {total_reward}, Epsilon: {epsilon}")

1
3
Episode: 1, Total Reward: 0.6149870314478934, Epsilon: 0.995
Episode: 2, Total Reward: 0.9629049108279807, Epsilon: 0.990025
Episode: 3, Total Reward: 0.995746988234752, Epsilon: 0.985074875
Episode: 4, Total Reward: 0.8594474149271518, Epsilon: 0.9801495006250001
Episode: 5, Total Reward: 0.5, Epsilon: 0.9752487531218751
Episode: 6, Total Reward: 0.6992522930463226, Epsilon: 0.9703725093562657
Episode: 7, Total Reward: 0.6960650025240978, Epsilon: 0.9655206468094844
Episode: 8, Total Reward: 0.5, Epsilon: 0.960693043575437
Episode: 9, Total Reward: 0.8176512385803614, Epsilon: 0.9558895783575597
Episode: 10, Total Reward: 0.8896180859525925, Epsilon: 0.9511101304657719
