In [1]:
import random
import numpy as np
import gymnasium as gym

from gymnasium import Env
from gymnasium.spaces import Discrete, Box

from rl.agents import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam

In [2]:
env = gym.make("CartPole-v1")
status = env.action_space
actions = env.observation_space.shape[0]

In [20]:
class ShowerEnv(Env):
    def __init__(self):
        # Actions we can take, down, stay, up
        self.action_space = Discrete(3)
        # Temperature array
        self.observation_space = Box(low=np.array([0]), high=np.array([100]))
        # Set start temp
        self.state = 38 + random.randint(-3,3)
        # Set shower length
        self.shower_length = 60
        
    def step(self, action):
        # Apply action
        # 0 -1 = -1 temperature
        # 1 -1 = 0 
        # 2 -1 = 1 temperature 
        self.state += action -1 
        # Reduce shower length by 1 second
        self.shower_length -= 1 
        
        # Calculate reward
        if self.state >=37 and self.state <=39: 
            reward =1 
        else: 
            reward = -1 
        
        # Check if shower is done
        if self.shower_length <= 0: 
            done = True
        else:
            done = False
        
        # Apply temperature noise
        #self.state += random.randint(-1,1)
        # Set placeholder for info
        info = {}
        
        # Return step information
        return self.state, reward, done, info

    def render(self, mode):
        # Implement viz
        pass
    
    def reset(self):
        # Reset shower temperature
        self.state = 38 + random.randint(-3,3)
        # Reset shower time
        self.shower_length = 60 
        return self.state
    

In [21]:
env = ShowerEnv()

In [22]:
env.observation_space.sample()

array([81.15043], dtype=float32)

In [23]:
episodes = 10
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0 
    
    while not done:
        #env.render()
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score+=reward
    print('Episode:{} Score:{}'.format(episode, score))

Episode:1 Score:0
Episode:2 Score:-44
Episode:3 Score:-42
Episode:4 Score:-26
Episode:5 Score:-22
Episode:6 Score:6
Episode:7 Score:-8
Episode:8 Score:-58
Episode:9 Score:-40
Episode:10 Score:8


In [24]:
states = env.observation_space.shape
actions = env.action_space.n

In [25]:
actions

3

In [26]:
def build_model(states, actions):
    model = Sequential()    
    model.add(Dense(24, activation='relu', input_shape=states))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(actions, activation='linear'))
    return model

In [27]:
model = build_model(states, actions)

In [28]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 24)                48        
                                                                 
 dense_4 (Dense)             (None, 24)                600       
                                                                 
 dense_5 (Dense)             (None, 3)                 75        
                                                                 
Total params: 723
Trainable params: 723
Non-trainable params: 0
_________________________________________________________________


In [29]:
env.observation_space

Box(0.0, 100.0, (1,), float32)

   # Build agent with Keras-RL

In [30]:
def build_agent(model, actions):
    policy = BoltzmannQPolicy()
    memory = SequentialMemory(limit=50000, window_length=1)
    dqn = DQNAgent(model=model, memory=memory, policy=policy, 
                  nb_actions=actions, nb_steps_warmup=10, target_model_update=1e-2)
    return dqn

In [31]:
dqn = build_agent(model, actions)


In [32]:
dqn.compile(Adam(learning_rate=1e-3), metrics=['mae'])

In [33]:
dqn.fit(env, nb_steps=50000, visualize=False, verbose=1)

Training for 50000 steps ...
Interval 1 (0 steps performed)
166 episodes - episode_reward: -32.000 [-60.000, 26.000] - loss: 0.913 - mae: 5.674 - mean_q: -7.402

Interval 2 (10000 steps performed)
167 episodes - episode_reward: -26.623 [-60.000, 34.000] - loss: 1.930 - mae: 9.667 - mean_q: -13.791

Interval 3 (20000 steps performed)
167 episodes - episode_reward: -21.904 [-60.000, 42.000] - loss: 1.695 - mae: 8.786 - mean_q: -12.442

Interval 4 (30000 steps performed)
166 episodes - episode_reward: 6.723 [-60.000, 54.000] - loss: 0.975 - mae: 5.450 - mean_q: -7.037

Interval 5 (40000 steps performed)
done, took 959.947 seconds


<keras.callbacks.History at 0x236eb6e38b0>

In [34]:
scores = dqn.test(env, nb_episodes=100, visualize=False)
print(np.mean(scores.history['episode_reward']))

Testing for 100 episodes ...
Episode 1: reward: 60.000, steps: 60
Episode 2: reward: 60.000, steps: 60
Episode 3: reward: 60.000, steps: 60
Episode 4: reward: 60.000, steps: 60
Episode 5: reward: 58.000, steps: 60
Episode 6: reward: 58.000, steps: 60
Episode 7: reward: 60.000, steps: 60
Episode 8: reward: 58.000, steps: 60
Episode 9: reward: 60.000, steps: 60
Episode 10: reward: 58.000, steps: 60
Episode 11: reward: 60.000, steps: 60
Episode 12: reward: 60.000, steps: 60
Episode 13: reward: 60.000, steps: 60
Episode 14: reward: 58.000, steps: 60
Episode 15: reward: 60.000, steps: 60
Episode 16: reward: 60.000, steps: 60
Episode 17: reward: 60.000, steps: 60
Episode 18: reward: 60.000, steps: 60
Episode 19: reward: 60.000, steps: 60
Episode 20: reward: 58.000, steps: 60
Episode 21: reward: 60.000, steps: 60
Episode 22: reward: 60.000, steps: 60
Episode 23: reward: 58.000, steps: 60
Episode 24: reward: 58.000, steps: 60
Episode 25: reward: 60.000, steps: 60
Episode 26: reward: 60.000, st

In [35]:
_ = dqn.test(env, nb_episodes=15, visualize=True)

Testing for 15 episodes ...
Episode 1: reward: 60.000, steps: 60
Episode 2: reward: 58.000, steps: 60
Episode 3: reward: 60.000, steps: 60
Episode 4: reward: 58.000, steps: 60
Episode 5: reward: 60.000, steps: 60
Episode 6: reward: 60.000, steps: 60
Episode 7: reward: 58.000, steps: 60
Episode 8: reward: 60.000, steps: 60
Episode 9: reward: 60.000, steps: 60
Episode 10: reward: 58.000, steps: 60
Episode 11: reward: 60.000, steps: 60
Episode 12: reward: 60.000, steps: 60
Episode 13: reward: 58.000, steps: 60
Episode 14: reward: 60.000, steps: 60
Episode 15: reward: 60.000, steps: 60


# Reloading Agent from Memory

In [36]:
dqn.save_weights('dqn_weights.h5f', overwrite=True)

In [37]:
del model
del dqn
del env

In [39]:
env = gym.make('CartPole-v1')
actions = env.action_space.n
states = env.observation_space.shape[0]

model = build_model(states, actions)
dqn = build_agent(model, actions)
dqn.compile(Adam(learning_rate=1e-3), metrics=['mae'])

In [40]:
dqn.load_weights('dqn_weights.h5f')

ValueError: Layer weight shape (4, 24) not compatible with provided weight shape (1, 24)

In [None]:
_ = dqn.test(env, nb_episodes=5, visualize=True)